/// <summary>
/// Builds the composer from the raw data files located in <paramref name="sourcesFolder"/>:
/// two frequency lists, the pinyin table, the Unihan readings file and the CEDICT dictionary.
/// </summary>
/// <param name="sourcesFolder">
/// Folder that holds junda-freq.txt, tsai-freq.txt, pinyin.txt, Unihan_Readings.txt and cedict.u8.
/// </param>
public Composer(string sourcesFolder)
{
    // Frequency lists are read first. Presumably readRanks fills ranksSimp / ranksTrad
    // (flag true vs. false), which are consumed further down - confirm in readRanks.
    string jundaFile = Path.Combine(sourcesFolder, "junda-freq.txt");
    readRanks(jundaFile, true);
    string tsaiFile = Path.Combine(sourcesFolder, "tsai-freq.txt");
    readRanks(tsaiFile, false);

    // The pinyin table is needed by every loader below.
    string pinyinFile = Path.Combine(sourcesFolder, "pinyin.txt");
    pinyin = new Pinyin(pinyinFile);

    // Both reading tables are parsed from the same Unihan file; only the rank table differs.
    string unihanFile = Path.Combine(sourcesFolder, "Unihan_Readings.txt");
    charReadingsSimp = new CharReadings(unihanFile, ranksSimp, pinyin);
    charReadingsTrad = new CharReadings(unihanFile, ranksTrad, pinyin);

    string cedictFile = Path.Combine(sourcesFolder, "cedict.u8");
    polyDict = new PolyDict(cedictFile, pinyin);
}
/// <summary>
/// Builds the composer from the pinyin table plus the two pre-generated reading maps
/// (simp-map.json and trad-map.json).
/// </summary>
/// <param name="sourcesFolder">
/// Folder that holds pinyin.txt. The JSON maps are NOT read from here: they are opened
/// through the relative path "wwwroot", i.e. relative to the current working directory.
/// </param>
public Composer(string sourcesFolder)
{
    string pinyinFile = Path.Combine(sourcesFolder, "pinyin.txt");
    pinyin = new Pinyin(pinyinFile);

    var serializer = new JsonSerializer();
    var readingListType = typeof(List<CharReading>);

    // Simplified map.
    string simpMapFile = Path.Combine("wwwroot", "simp-map.json");
    using (var reader = new StreamReader(simpMapFile))
    {
        object parsed = serializer.Deserialize(reader, readingListType);
        // "as" leaves the field null when the payload is not a reading list.
        readingsSimp = parsed as List<CharReading>;
    }

    // Traditional map.
    string tradMapFile = Path.Combine("wwwroot", "trad-map.json");
    using (var reader = new StreamReader(tradMapFile))
    {
        object parsed = serializer.Deserialize(reader, readingListType);
        readingsTrad = parsed as List<CharReading>;
    }

    // Punctuation readings are currently disabled:
    // addPunctReadings(readingsSimp, true);
    // addPunctReadings(readingsTrad, false);
}
/// <summary>
/// Reads a CEDICT file and indexes its multi-syllable headwords by their normalized
/// pinyin string, separately for the simplified and the traditional form.
/// </summary>
/// <param name="fn">Path of the CEDICT file.</param>
/// <param name="pinyin">Used to validate every syllable through IsNumSyllable.</param>
public PolyDict(string fn, Pinyin pinyin)
{
    // Entry layout: "<trad> <simp> [<pinyin>] /senses/", for example
    // 玩意兒 玩意儿 [wan2 yi4 r5] /erhua variant of 玩意[wan2 yi4]/
    var entryPattern = new Regex(@"^([^ ]+) ([^ ]+) \[([^\]]+)\]");

    using (var reader = new StreamReader(fn))
    {
        for (string entry = reader.ReadLine(); entry != null; entry = reader.ReadLine())
        {
            Match match = entryPattern.Match(entry);
            if (!match.Success)
            {
                continue;
            }

            // Normalize: "u:" becomes "v", the tone digit 5 is dropped, everything lower-cased.
            string key = match.Groups[3].Value
                .Replace("u:", "v")
                .Replace("5", "")
                .ToLowerInvariant();
            string[] syllables = key.Split(' ');

            // Every syllable is checked (no early exit); one bad syllable rejects the entry.
            bool allSyllablesValid = true;
            foreach (string syllable in syllables)
            {
                if (!pinyin.IsNumSyllable(syllable))
                {
                    allSyllablesValid = false;
                }
            }
            if (!allSyllablesValid)
            {
                continue;
            }

            string tradWord = match.Groups[1].Value;
            string simpWord = match.Groups[2].Value;

            // Keep only words with more than one syllable whose syllable count equals the
            // length of both written forms (length here is the string's Length).
            bool usable = syllables.Length == tradWord.Length
                && syllables.Length != 1
                && tradWord.Length == simpWord.Length;
            if (!usable)
            {
                continue;
            }

            if (!dictSimp.ContainsKey(key))
            {
                dictSimp[key] = new List<string>();
            }
            if (!dictTrad.ContainsKey(key))
            {
                dictTrad[key] = new List<string>();
            }
            dictSimp[key].Add(simpWord);
            dictTrad[key].Add(tradWord);
        }
    }
}
/// <summary>
/// Collects candidate readings per character from a Unihan readings file, keeps the ones
/// that <paramref name="pinyin"/> can convert and <paramref name="polyDict"/> confirms,
/// and stores them in ReadingsList ordered by rank.
/// </summary>
/// <param name="fn">Path of the Unihan readings file (tab-separated lines starting with "U+").</param>
/// <param name="ranks">
/// Rank per character. Lower ranks sort first and ranked characters sort before unranked ones.
/// kMandarin entries are only used for characters present in this table.
/// </param>
/// <param name="pinyin">Converts a surface reading through SurfToNums; a null result drops the reading.</param>
/// <param name="polyDict">Readings for which HasReading returns false are dropped.</param>
/// <param name="isSimp">Passed through unchanged to polyDict.HasReading.</param>
public CharReadings(string fn, Dictionary<string, int> ranks, Pinyin pinyin, PolyDict polyDict, bool isSimp)
{
    var rdict = new Dictionary<string, HashSet<string>>();

    // Sample lines handled below:
    // U+7684	kHanyuPinlu	de(75596) dì(157) dí(84)
    // U+7684	kHanyuPinyin	42644.160:dì,dí,de
    // U+5730	kMandarin	de dì
    var reHanyuPinlu = new Regex(@"U\+[^\t]+\tkHanyuPinlu\t(.+)");
    var reHanyuPinyin = new Regex(@"U\+[^\t]+\tkHanyuPinyin\t(.+)");
    var reMandarin = new Regex(@"U\+[^\t]+\tkMandarin\t(.+)");

    using (var sr = new StreamReader(fn))
    {
        string line;
        while ((line = sr.ReadLine()) != null)
        {
            // Ordinal: this is machine-readable data, not linguistic text.
            if (!line.StartsWith("U+", StringComparison.Ordinal))
            {
                continue;
            }

            // A line without a tab, or with nothing between "U+" and the tab, carries no
            // code point. It used to crash in Substring / Convert.ToInt32; skip it instead.
            int tabIdx = line.IndexOf('\t');
            if (tabIdx <= 2)
            {
                continue;
            }
            string charCode = line.Substring(2, tabIdx - 2);
            string hanzi = char.ConvertFromUtf32(Convert.ToInt32(charCode, 16));

            HashSet<string> readings;
            if (!rdict.TryGetValue(hanzi, out readings))
            {
                readings = new HashSet<string>();
                rdict[hanzi] = readings;
            }

            Match m = reHanyuPinlu.Match(line);
            if (m.Success)
            {
                foreach (string token in m.Groups[1].Value.Split(' '))
                {
                    // Stray or trailing spaces yield empty tokens; nothing to record.
                    if (token.Length == 0)
                    {
                        continue;
                    }
                    // "de(75596)" -> "de". A token without a count is taken as-is
                    // instead of throwing from Substring(0, -1).
                    int parenIdx = token.IndexOf('(');
                    readings.Add(parenIdx < 0 ? token : token.Substring(0, parenIdx));
                }
                continue;
            }

            m = reHanyuPinyin.Match(line);
            if (m.Success)
            {
                foreach (string val in m.Groups[1].Value.Split(' '))
                {
                    // "42644.160:dì,dí,de" -> the comma-separated list after the colon.
                    string[] parts = val.Substring(val.IndexOf(':') + 1).Split(',');
                    foreach (string reading in parts)
                    {
                        readings.Add(reading);
                    }
                }
                continue;
            }

            m = reMandarin.Match(line);
            if (m.Success)
            {
                // For now, kMandarin is only considered for characters on the (short-ish) frequency lists.
                // This can be extended later, with a Unihan-based decision to separate simplified from traditional.
                if (!ranks.ContainsKey(hanzi))
                {
                    continue;
                }
                foreach (string reading in m.Groups[1].Value.Split(' '))
                {
                    readings.Add(reading);
                }
                continue;
            }
        }
    }

    foreach (var entry in rdict)
    {
        foreach (string reading in entry.Value)
        {
            var readingNums = pinyin.SurfToNums(reading);
            if (readingNums == null)
            {
                continue;
            }
            // Filter readings: only keep what we've seen in CEDICT.
            if (!polyDict.HasReading(entry.Key, readingNums, isSimp))
            {
                continue;
            }
            ReadingsList.Add(new CharReading { Hanzi = entry.Key, Pinyin = readingNums });
        }
    }

    // Sort by rank (more frequent ones come first); characters without a rank go last.
    ReadingsList.Sort((a, b) =>
    {
        int rankA;
        int rankB;
        bool hasA = ranks.TryGetValue(a.Hanzi, out rankA);
        bool hasB = ranks.TryGetValue(b.Hanzi, out rankB);
        if (hasA && hasB)
        {
            return rankA.CompareTo(rankB);
        }
        if (hasA)
        {
            return -1;
        }
        if (hasB)
        {
            return 1;
        }
        return 0;
    });
}
/// <summary>
/// Reads a CEDICT file and fills two kinds of tables: headwords indexed by their normalized
/// pinyin string (DictSimp / DictTrad, words of more than one character only) and the
/// syllables seen for each individual character (charReadingsSimp / charReadingsTrad).
/// </summary>
/// <param name="fn">Path of the CEDICT file.</param>
/// <param name="pinyin">Used to validate every syllable through IsNumSyllable.</param>
public PolyDict(string fn, Pinyin pinyin)
{
    // Entry layout: "<trad> <simp> [<pinyin>] /senses/", for example
    // 玩意兒 玩意儿 [wan2 yi4 r5] /erhua variant of 玩意[wan2 yi4]/
    var entryPattern = new Regex(@"^([^ ]+) ([^ ]+) \[([^\]]+)\]");

    using (var reader = new StreamReader(fn))
    {
        for (string entry = reader.ReadLine(); entry != null; entry = reader.ReadLine())
        {
            Match match = entryPattern.Match(entry);
            if (!match.Success)
            {
                continue;
            }

            // Normalize: "u:" becomes "v", the tone digit 5 is dropped, everything lower-cased.
            string key = match.Groups[3].Value
                .Replace("u:", "v")
                .Replace("5", "")
                .ToLowerInvariant();
            string[] syllables = key.Split(' ');

            // Break both written forms into units via asUniChars
            // (presumably one string per Unicode character - confirm in asUniChars).
            var simpChars = new List<string>();
            var tradChars = new List<string>();
            foreach (string unit in asUniChars(match.Groups[2].Value))
            {
                simpChars.Add(unit);
            }
            foreach (string unit in asUniChars(match.Groups[1].Value))
            {
                tradChars.Add(unit);
            }

            // Words are stored with their characters separated by single spaces.
            string tradWord = string.Join(" ", tradChars.ToArray());
            string simpWord = string.Join(" ", simpChars.ToArray());

            // Reject when the counts disagree, a syllable is invalid, or a traditional
            // character is not Hanzi. Every check runs; there is no early exit.
            bool reject = syllables.Length != tradChars.Count || tradChars.Count != simpChars.Count;
            foreach (string syllable in syllables)
            {
                if (!pinyin.IsNumSyllable(syllable))
                {
                    reject = true;
                }
            }
            foreach (string tradChar in tradChars)
            {
                if (!isHanzi(tradChar))
                {
                    reject = true;
                }
            }
            if (reject)
            {
                continue;
            }

            // The buckets are created for every accepted entry, but only words with more
            // than one character are stored in them.
            if (!DictSimp.ContainsKey(key))
            {
                DictSimp[key] = new List<string>();
            }
            if (!DictTrad.ContainsKey(key))
            {
                DictTrad[key] = new List<string>();
            }
            if (simpChars.Count > 1)
            {
                DictSimp[key].Add(simpWord);
                DictTrad[key].Add(tradWord);
            }

            // Record the syllable observed at each position for the character at that position.
            for (int pos = 0; pos < syllables.Length; ++pos)
            {
                string simpChar = simpChars[pos];
                string tradChar = tradChars[pos];
                string syllable = syllables[pos];

                if (!charReadingsSimp.ContainsKey(simpChar))
                {
                    charReadingsSimp[simpChar] = new List<string>();
                }
                if (!charReadingsTrad.ContainsKey(tradChar))
                {
                    charReadingsTrad[tradChar] = new List<string>();
                }
                charReadingsSimp[simpChar].Add(syllable);
                charReadingsTrad[tradChar].Add(syllable);
            }
        }
    }
}