/// <summary>
/// Loads the special expressions dictionary.
/// </summary>
/// <remarks>
/// Reads every line of the file at <c>PathHelper.SpecialReadingsPath</c>. Blank lines and lines
/// starting with ';' are skipped. Every other line is split on
/// <c>SeparatorHelper.FileFieldSeparator</c>: field 0 is the kanji reading, field 1 is the kana
/// reading and, when the line has exactly three fields, field 2 is an explicit furigana solution.
/// Lines sharing the same kanji reading accumulate their readings on a single entry.
/// </remarks>
private void LoadSpecialExpressions()
{
    _specialExpressions = new Dictionary<string, SpecialExpression>();
    foreach (string line in File.ReadAllLines(PathHelper.SpecialReadingsPath))
    {
        // The line is known to be non-empty once the whitespace check has passed,
        // so indexing its first character is safe.
        if (string.IsNullOrWhiteSpace(line) || line[0] == ';')
        {
            continue;
        }

        string[] split = line.Split(SeparatorHelper.FileFieldSeparator);
        string kanjiReading = split[0];
        string kanaReading = split[1];
        VocabEntry v = new VocabEntry(kanjiReading, kanaReading);

        // Read the solution if it is explicitly written. Otherwise build one whose single
        // furigana part covers the kanji reading from index 0 to its last index.
        FuriganaSolution solution = split.Length == 3
            ? FuriganaSolution.Parse(split[2], v)
            : new FuriganaSolution(v, new FuriganaPart(kanaReading, 0, kanjiReading.Length - 1));

        SpecialReading specialReading = new SpecialReading(kanaReading, solution);

        // Add the reading to the existing expression, or create the expression.
        // TryGetValue avoids looking the key up twice.
        SpecialExpression existing;
        if (_specialExpressions.TryGetValue(kanjiReading, out existing))
        {
            existing.Readings.Add(specialReading);
        }
        else
        {
            _specialExpressions.Add(kanjiReading, new SpecialExpression(kanjiReading, specialReading));
        }
    }
}
/// <summary>
/// Writes the single solution of each solution set, one per line, to <c>OutputPath</c> (UTF-8),
/// and serializes the same solutions as a JSON array to a sibling-named ".json" file.
/// A solution whose string form is already in <c>AlreadyWritten</c> is not written again.
/// The outcome of every set, and a final summary, are logged.
/// </summary>
/// <param name="solutions">The solution sets to write. Enumerated once.</param>
public void Write(IEnumerable<FuriganaSolutionSet> solutions)
{
    int success = 0, total = 0;
    log4net.ILog logger = log4net.LogManager.GetLogger("Writer");

    // Stopwatch measures elapsed time independently of wall-clock changes (DST, clock sync),
    // unlike subtracting two DateTime.Now values.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();

    // NOTE(review): only the file name of OutputPath is kept here, so the JSON file path is
    // relative and does not include OutputPath's directory - confirm this is intended.
    string jsonFileName = $"{Path.GetFileNameWithoutExtension(OutputPath)}.json";

    using (var stream = new StreamWriter(OutputPath, false, Encoding.UTF8))
    using (var jsonStream = new StreamWriter(jsonFileName, false, Encoding.UTF8))
    using (var jsonWriter = new JsonTextWriter(jsonStream))
    {
        jsonWriter.WriteStartArray();
        var jsonSerializer = new JsonSerializer();
        jsonSerializer.Converters.Add(new FuriganaSolutionJsonSerializer());

        foreach (FuriganaSolutionSet solution in solutions)
        {
            FuriganaSolution singleSolution = solution.GetSingleSolution();

            // Log the outcome: no solution at all (X), several candidates (➕), or one (◯).
            if (solution.Any())
            {
                if (singleSolution == null)
                {
                    logger.InfoFormat("➕ {0}", solution);
                }
                else
                {
                    logger.InfoFormat("◯ {0}", solution);
                }
            }
            else
            {
                logger.InfoFormat("X {0}|{1}|???", solution.Vocab.KanjiReading, solution.Vocab.KanaReading);
            }

            if (singleSolution != null)
            {
                // Compute the string form once; it is both the output line and the dedup key.
                string serialized = singleSolution.ToString();
                if (!AlreadyWritten.Contains(serialized))
                {
                    stream.WriteLine(serialized);
                    AlreadyWritten.Add(serialized);
                    jsonSerializer.Serialize(jsonWriter, singleSolution);
                }

                // Duplicates still count as successfully solved.
                success++;
            }

            total++;
        }

        jsonWriter.WriteEndArray();
    }

    stopwatch.Stop();
    logger.InfoFormat("Successfuly ended process with {0} out of {1} successfuly found furigana strings.", success, total);
    logger.InfoFormat("Process took {0} seconds.", stopwatch.Elapsed.TotalSeconds);
}
/// <summary>
/// Writes the single solution of each solution set, one per line, to <c>OutputPath</c> (UTF-8),
/// and serializes the same solutions as a JSON array to a sibling-named ".json" file.
/// A solution whose string form is already in <c>AlreadyWritten</c> is not written again.
/// The outcome of every set, and a final summary, are logged.
/// </summary>
/// <param name="solutions">The solution sets to write. Enumerated once.</param>
public void Write(IEnumerable<FuriganaSolutionSet> solutions)
{
    int success = 0, total = 0;
    var logger = LogManager.GetCurrentClassLogger();

    // Stopwatch measures elapsed time independently of wall-clock changes (DST, clock sync),
    // unlike subtracting two DateTime.Now values.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();

    // NOTE(review): only the file name of OutputPath is kept here, so the JSON file path is
    // relative and does not include OutputPath's directory - confirm this is intended.
    string jsonFileName = $"{Path.GetFileNameWithoutExtension(OutputPath)}.json";

    using (var stream = new StreamWriter(OutputPath, false, Encoding.UTF8))
    using (var jsonStream = new StreamWriter(jsonFileName, false, Encoding.UTF8))
    using (var jsonWriter = new JsonTextWriter(jsonStream))
    {
        jsonWriter.WriteStartArray();
        var jsonSerializer = new JsonSerializer();
        jsonSerializer.Converters.Add(new FuriganaSolutionJsonSerializer());

        foreach (FuriganaSolutionSet solution in solutions)
        {
            FuriganaSolution singleSolution = solution.GetSingleSolution();

            // Log the outcome: no solution at all (X), several candidates (➕), or one (◯).
            if (solution.Any())
            {
                if (singleSolution == null)
                {
                    logger.Info($"➕ {solution}");
                }
                else
                {
                    logger.Info($"◯ {solution}");
                }
            }
            else
            {
                logger.Info($"X {solution.Vocab.KanjiReading}|{solution.Vocab.KanaReading}|???");
            }

            if (singleSolution != null)
            {
                // Compute the string form once; it is both the output line and the dedup key.
                string serialized = singleSolution.ToString();
                if (!AlreadyWritten.Contains(serialized))
                {
                    stream.WriteLine(serialized);
                    AlreadyWritten.Add(serialized);
                    jsonSerializer.Serialize(jsonWriter, singleSolution);
                }

                // Duplicates still count as successfully solved.
                success++;
            }

            total++;
        }

        jsonWriter.WriteEndArray();
    }

    stopwatch.Stop();
    TimeSpan duration = stopwatch.Elapsed;
    logger.Info($"Successfuly ended process with {success} out of {total} successfuly found furigana strings.");
    logger.Info($"Process took {duration}.");
}
public void Test_BreakIntoParts_Akagaeruka()
{
    // Arrange: a katakana word followed by one kanji; only the character at index 5 (科)
    // is given a furigana.
    VocabEntry entry = new VocabEntry("アカガエル科", "アカガエルか");
    FuriganaPart kaPart = new FuriganaPart("か", 5);
    FuriganaSolution sut = new FuriganaSolution(entry, kaPart);

    // Act
    var pieces = sut.BreakIntoParts().ToList();

    // Assert: the leading kana run comes first without furigana, then the annotated kanji.
    Assert.AreEqual(2, pieces.Count);

    var kanaRun = pieces[0];
    Assert.AreEqual("アカガエル", kanaRun.Text);
    Assert.IsNull(kanaRun.Furigana);

    var kanjiPiece = pieces[1];
    Assert.AreEqual("科", kanjiPiece.Text);
    Assert.AreEqual("か", kanjiPiece.Furigana);
}
/// <summary>
/// Attempts to solve furigana by looking the vocab entry up in the override list.
/// </summary>
/// <param name="r">Resource set that is asked for an override of <paramref name="v"/>.</param>
/// <param name="v">Vocab entry to solve.</param>
/// <returns>
/// One solution carrying the override's furigana when an override exists; nothing otherwise.
/// </returns>
protected override IEnumerable<FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
{
    FuriganaSolution overrideSolution = r.GetOverride(v);
    if (overrideSolution == null)
    {
        // No override registered for this entry: this solver has nothing to offer.
        yield break;
    }

    // Return a fresh solution bound to the entry being solved, rather than the
    // instance held by the resource set.
    FuriganaSolution result = new FuriganaSolution();
    result.Furigana = overrideSolution.Furigana;
    result.Vocab = v;
    yield return result;
}
/// <summary>
/// Loads the furigana override list.
/// </summary>
/// <remarks>
/// Each meaningful line of the file at <c>PathHelper.OverrideFuriganaPath</c> is split on
/// <c>SeparatorHelper.FileFieldSeparator</c>. Fields 0 and 1 build the vocab entry whose string
/// form is the dictionary key; field 2 is parsed as the furigana solution stored under that key.
/// Blank lines and lines starting with ';' are ignored.
/// </remarks>
private void LoadOverrideList()
{
    _overrideList = new Dictionary<string, FuriganaSolution>();

    string[] rawLines = File.ReadAllLines(PathHelper.OverrideFuriganaPath);
    foreach (string rawLine in rawLines)
    {
        // Only process lines that have content and are not ';' comment lines.
        if (!string.IsNullOrWhiteSpace(rawLine) && rawLine[0] != ';')
        {
            string[] fields = rawLine.Split(SeparatorHelper.FileFieldSeparator);

            // The key is the string form of the vocab entry (kanji reading, kana reading).
            string key = new VocabEntry(fields[0], fields[1]).ToString();

            // The stored solution is parsed without an attached vocab entry.
            FuriganaSolution parsed = FuriganaSolution.Parse(fields[2], null);

            _overrideList.Add(key, parsed);
        }
    }
}
/// <summary>
/// Writes the single solution of each solution set, one per line, to <c>OutputPath</c> (UTF-8).
/// A solution whose string form is already in <c>AlreadyWritten</c> is not written again.
/// The outcome of every set, and a final summary, are logged.
/// </summary>
/// <param name="solutions">The solution sets to write. Enumerated once.</param>
public void Write(IEnumerable<FuriganaSolutionSet> solutions)
{
    int success = 0, total = 0;
    log4net.ILog logger = log4net.LogManager.GetLogger("Writer");

    // Stopwatch measures elapsed time independently of wall-clock changes (DST, clock sync),
    // unlike subtracting two DateTime.Now values.
    System.Diagnostics.Stopwatch stopwatch = System.Diagnostics.Stopwatch.StartNew();

    using (StreamWriter stream = new StreamWriter(OutputPath, false, Encoding.UTF8))
    {
        foreach (FuriganaSolutionSet solution in solutions)
        {
            FuriganaSolution singleSolution = solution.GetSingleSolution();

            // Log the outcome: no solution at all (X), several candidates (➕), or one (◯).
            if (solution.Any())
            {
                if (singleSolution == null)
                {
                    logger.InfoFormat("➕ {0}", solution);
                }
                else
                {
                    logger.InfoFormat("◯ {0}", solution);
                }
            }
            else
            {
                logger.InfoFormat("X {0}|{1}|???", solution.Vocab.KanjiReading, solution.Vocab.KanaReading);
            }

            if (singleSolution != null)
            {
                // Compute the string form once; it is both the output line and the dedup key.
                string serialized = singleSolution.ToString();
                if (!AlreadyWritten.Contains(serialized))
                {
                    stream.WriteLine(serialized);
                    AlreadyWritten.Add(serialized);
                }

                // Duplicates still count as successfully solved.
                success++;
            }

            total++;
        }
    }

    stopwatch.Stop();
    logger.InfoFormat("Successfuly ended process with {0} out of {1} successfuly found furigana strings.", success, total);
    logger.InfoFormat("Process took {0} seconds.", stopwatch.Elapsed.TotalSeconds);
}
public void Test_Furigana(string kanjiReading, string kanaReading, string expectedFurigana)
{
    // Arrange
    VocabEntry v = new VocabEntry(kanjiReading, kanaReading);
    FuriganaBusiness business = new FuriganaBusiness(DictionaryFile.Jmdict);

    // Act: resolve the single solution once so the null check and the comparison
    // look at the same object.
    FuriganaSolutionSet result = business.Execute(v);
    FuriganaSolution actual = result.GetSingleSolution();

    // Assert: fail with the offending inputs so a failing case can be identified
    // from the test output alone.
    if (actual == null)
    {
        Assert.Fail(string.Format(
            "No single furigana solution was found for {0} ({1}); expected {2}.",
            kanjiReading, kanaReading, expectedFurigana));
    }

    Assert.AreEqual(FuriganaSolution.Parse(expectedFurigana, v), actual);
}
public void Test_BreakIntoParts_Otonagai()
{
    // Arrange: one furigana spans the two kanji at indexes 0-1 (大人), a second one
    // covers the kanji at index 2 (買); the trailing い has none.
    VocabEntry entry = new VocabEntry("大人買い", "おとながい");
    FuriganaPart otonaPart = new FuriganaPart("おとな", 0, 1);
    FuriganaPart gaPart = new FuriganaPart("が", 2);
    FuriganaSolution sut = new FuriganaSolution(entry, otonaPart, gaPart);

    // Act
    var pieces = sut.BreakIntoParts().ToList();

    // Assert: two annotated pieces followed by the bare trailing kana.
    Assert.AreEqual(3, pieces.Count);

    var compound = pieces[0];
    Assert.AreEqual("大人", compound.Text);
    Assert.AreEqual("おとな", compound.Furigana);

    var singleKanji = pieces[1];
    Assert.AreEqual("買", singleKanji.Text);
    Assert.AreEqual("が", singleKanji.Furigana);

    var trailingKana = pieces[2];
    Assert.AreEqual("い", trailingKana.Text);
    Assert.IsNull(trailingKana.Furigana);
}
/// <summary>
/// Attempts to solve furigana in cases where there are no consecutive kanji in the kanji string,
/// using regular expressions.
/// </summary>
/// <param name="r">Resource set used to decide whether a character is a known kanji.</param>
/// <param name="v">Vocab entry to solve.</param>
/// <returns>
/// One solution when the greedy and the lazy pattern both match the kana reading and lead to
/// equal, non-null solutions; nothing when two kanji are adjacent or the reading is ambiguous.
/// </returns>
protected override IEnumerable<FuriganaSolution> DoSolve(FuriganaResourceSet r, VocabEntry v)
{
    // We are using both a greedy expression and a lazy expression because we want to make sure
    // there is only one way to read the entry. If the result differs between the greedy and the
    // lazy expression, the reading is ambiguous and no solution is returned.
    string regGreedy = "^";
    string regLazy = "^";
    bool consecutiveMarker = false;
    List<int> kanjiIndexes = new List<int>(4);

    for (int i = 0; i < v.KanjiReading.Length; i++)
    {
        char c = v.KanjiReading[i];
        if (r.GetKanji(c) == null)
        {
            // Not a kanji: the character must match itself, with no capture group.
            // It is escaped so that regex metacharacters ('.', '(', '+', '?', ...) are taken
            // literally instead of altering or breaking the pattern.
            string literal = Regex.Escape(c.ToString());
            regGreedy += literal;
            regLazy += literal;
            consecutiveMarker = false;
        }
        else if (consecutiveMarker)
        {
            // Consecutive kanji. The vocab entry is not eligible for this solution.
            yield break;
        }
        else
        {
            // Kanji: capture whatever part of the kana reading it stands for.
            regGreedy += "(.+)";
            regLazy += "(.+?)";
            consecutiveMarker = true;
            kanjiIndexes.Add(i);
        }
    }

    regGreedy += "$";
    regLazy += "$";

    // Example regex:
    // For 持ち運ぶ (もちはこぶ)
    // The regexes would be:
    // ^(.+)ち(.+)ぶ$
    // ^(.+?)ち(.+?)ぶ$

    Regex regexGreedy = new Regex(regGreedy);
    Regex regexLazy = new Regex(regLazy);
    Match matchGreedy = regexGreedy.Match(v.KanaReading);
    Match matchLazy = regexLazy.Match(v.KanaReading);

    if (matchGreedy.Success && matchLazy.Success)
    {
        // Obtain both solutions.
        FuriganaSolution greedySolution = MakeSolutionFromMatch(v, matchGreedy, kanjiIndexes);
        FuriganaSolution lazySolution = MakeSolutionFromMatch(v, matchLazy, kanjiIndexes);

        // Both solutions must be non-null and equivalent for the reading to be unambiguous.
        if (greedySolution != null && lazySolution != null && greedySolution.Equals(lazySolution))
        {
            // They are equal, so returning either one is enough; the greedy one is used.
            yield return greedySolution;
        }
    }
}