Ejemplo n.º 1
0
 /// <summary>
 /// Creates a composer by loading its data files from <paramref name="sourcesFolder"/>.
 /// </summary>
 /// <param name="sourcesFolder">
 /// Folder containing junda-freq.txt, tsai-freq.txt, pinyin.txt, Unihan_Readings.txt and cedict.u8.
 /// </param>
 public Composer(string sourcesFolder)
 {
     string jundaPath  = Path.Combine(sourcesFolder, "junda-freq.txt");
     string tsaiPath   = Path.Combine(sourcesFolder, "tsai-freq.txt");
     string pinyinPath = Path.Combine(sourcesFolder, "pinyin.txt");
     string unihanPath = Path.Combine(sourcesFolder, "Unihan_Readings.txt");
     string cedictPath = Path.Combine(sourcesFolder, "cedict.u8");

     // The rank lists are read before anything else because the character
     // readings below are constructed from ranksSimp / ranksTrad.
     readRanks(jundaPath, true);
     readRanks(tsaiPath, false);

     pinyin = new Pinyin(pinyinPath);

     // Both reading tables come from the same Unihan file; only the rank list differs.
     charReadingsSimp = new CharReadings(unihanPath, ranksSimp, pinyin);
     charReadingsTrad = new CharReadings(unihanPath, ranksTrad, pinyin);

     polyDict = new PolyDict(cedictPath, pinyin);
 }
Ejemplo n.º 2
0
        /// <summary>
        /// Creates a composer from the pinyin table in <paramref name="sourcesFolder"/> and the
        /// pre-built simplified/traditional reading maps stored as JSON under "wwwroot".
        /// </summary>
        /// <param name="sourcesFolder">Folder containing pinyin.txt. The JSON maps are NOT read from here.</param>
        public Composer(string sourcesFolder)
        {
            pinyin = new Pinyin(Path.Combine(sourcesFolder, "pinyin.txt"));

            var serializer = new JsonSerializer();
            readingsSimp = loadReadingMap(serializer, "simp-map.json");
            readingsTrad = loadReadingMap(serializer, "trad-map.json");

            //addPunctReadings(readingsSimp, true);
            //addPunctReadings(readingsTrad, false);
        }

        /// <summary>
        /// Deserializes one list of character readings from a JSON file in the relative "wwwroot" folder.
        /// </summary>
        /// <param name="serializer">Serializer instance shared between the two loads.</param>
        /// <param name="fileName">File name inside "wwwroot".</param>
        /// <returns>The deserialized list, or null if the result is not a list of readings.</returns>
        private static List<CharReading> loadReadingMap(JsonSerializer serializer, string fileName)
        {
            using (var reader = new StreamReader(Path.Combine("wwwroot", fileName)))
            {
                return serializer.Deserialize(reader, typeof(List<CharReading>)) as List<CharReading>;
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Reads a CEDICT-format file and indexes its multi-syllable headwords by normalized pinyin.
        /// </summary>
        /// <param name="fn">Path of the dictionary file.</param>
        /// <param name="pinyin">Used to check that every syllable of an entry is a valid numbered syllable.</param>
        public PolyDict(string fn, Pinyin pinyin)
        {
            // Sample entry:
            // 玩意兒 玩意儿 [wan2 yi4 r5] /erhua variant of 玩意[wan2 yi4]/
            var entryPattern = new Regex(@"^([^ ]+) ([^ ]+) \[([^\]]+)\]");

            using (var reader = new StreamReader(fn))
            {
                for (string entry = reader.ReadLine(); entry != null; entry = reader.ReadLine())
                {
                    var parsed = entryPattern.Match(entry);
                    if (!parsed.Success)
                    {
                        continue;
                    }

                    // Normalize the pinyin: "u:" becomes "v", the neutral-tone digit is dropped, all lower case.
                    string key = parsed.Groups[3].Value
                                 .Replace("u:", "v")
                                 .Replace("5", "")
                                 .ToLowerInvariant();
                    string[] syllables = key.Split(' ');

                    // Every syllable is checked (no early exit); one bad syllable rejects the entry.
                    bool allSyllablesValid = true;
                    foreach (string syllable in syllables)
                    {
                        if (!pinyin.IsNumSyllable(syllable))
                        {
                            allSyllablesValid = false;
                        }
                    }
                    if (!allSyllablesValid)
                    {
                        continue;
                    }

                    string traditional = parsed.Groups[1].Value;
                    string simplified  = parsed.Groups[2].Value;

                    // Keep only words of two or more syllables whose syllable count equals the
                    // length of both written forms. Lengths here are string.Length values,
                    // i.e. UTF-16 code units.
                    bool usable = syllables.Length == traditional.Length
                                  && syllables.Length != 1
                                  && traditional.Length == simplified.Length;
                    if (!usable)
                    {
                        continue;
                    }

                    if (!dictSimp.ContainsKey(key))
                    {
                        dictSimp[key] = new List<string>();
                    }
                    dictSimp[key].Add(simplified);

                    if (!dictTrad.ContainsKey(key))
                    {
                        dictTrad[key] = new List<string>();
                    }
                    dictTrad[key].Add(traditional);
                }
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Builds the list of character readings from a Unihan readings file.
        /// </summary>
        /// <remarks>
        /// Surface readings are gathered from the kHanyuPinlu, kHanyuPinyin and kMandarin fields.
        /// Each one is converted with <c>pinyin.SurfToNums</c>; readings that convert to null or that
        /// <c>polyDict.HasReading</c> does not confirm are dropped. The survivors are appended to
        /// <c>ReadingsList</c>, which is then sorted by rank: ranked characters first (ascending rank),
        /// unranked characters after them.
        /// </remarks>
        /// <param name="fn">Path of the Unihan readings file.</param>
        /// <param name="ranks">Frequency rank per character; also gates which kMandarin lines are used.</param>
        /// <param name="pinyin">Converter from surface pinyin to the numbered form.</param>
        /// <param name="polyDict">Dictionary used to keep only readings attested for the character.</param>
        /// <param name="isSimp">Passed through to <c>polyDict.HasReading</c>.</param>
        public CharReadings(string fn, Dictionary <string, int> ranks, Pinyin pinyin, PolyDict polyDict, bool isSimp)
        {
            Dictionary<string, HashSet<string>> surfaceReadings = readUnihanSurfaceReadings(fn, ranks);

            foreach (var entry in surfaceReadings)
            {
                foreach (var reading in entry.Value)
                {
                    var readingNums = pinyin.SurfToNums(reading);
                    if (readingNums == null)
                    {
                        continue;
                    }
                    // Filter readings: only keep what we've seen in CEDICT
                    if (!polyDict.HasReading(entry.Key, readingNums, isSimp))
                    {
                        continue;
                    }
                    ReadingsList.Add(new CharReading
                    {
                        Hanzi  = entry.Key,
                        Pinyin = readingNums,
                    });
                }
            }

            // Sort by rank (more frequent ones come first); characters without a rank go last.
            ReadingsList.Sort((a, b) =>
            {
                int  rankA, rankB;
                bool hasA = ranks.TryGetValue(a.Hanzi, out rankA);
                bool hasB = ranks.TryGetValue(b.Hanzi, out rankB);
                if (hasA && hasB)
                {
                    return(rankA.CompareTo(rankB));
                }
                if (hasA)
                {
                    return(-1);
                }
                return(hasB ? 1 : 0);
            });
        }

        /// <summary>
        /// Parses the Unihan readings file into a map from character to its set of surface readings.
        /// </summary>
        /// <param name="fn">Path of the Unihan readings file.</param>
        /// <param name="ranks">kMandarin readings are only taken for characters present in this map.</param>
        /// <returns>One entry per character seen in the file; the set may be empty.</returns>
        private static Dictionary<string, HashSet<string>> readUnihanSurfaceReadings(string fn, Dictionary<string, int> ranks)
        {
            var result = new Dictionary<string, HashSet<string>>();
            // U+7684	kHanyuPinlu	de(75596) dì(157) dí(84)
            // U+7684	kHanyuPinyin	42644.160:dì,dí,de
            // U+5730	kMandarin	de dì
            var reHanyuPinlu  = new Regex(@"U\+[^\t]+\tkHanyuPinlu\t(.+)");
            var reHanyuPinyin = new Regex(@"U\+[^\t]+\tkHanyuPinyin\t(.+)");
            var reMandarin    = new Regex(@"U\+[^\t]+\tkMandarin\t(.+)");

            using (var sr = new StreamReader(fn))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    // Ordinal comparison: this is a file-format marker, not linguistic text.
                    if (!line.StartsWith("U+", StringComparison.Ordinal))
                    {
                        continue;
                    }
                    // A data line needs a non-empty code point followed by a tab; anything
                    // else is skipped instead of failing inside Substring.
                    int tabPos = line.IndexOf('\t');
                    if (tabPos <= 2)
                    {
                        continue;
                    }
                    string charCode = line.Substring(2, tabPos - 2);
                    string hanzi    = char.ConvertFromUtf32(Convert.ToInt32(charCode, 16));

                    HashSet<string> readings;
                    if (!result.TryGetValue(hanzi, out readings))
                    {
                        readings = new HashSet<string>();
                        result[hanzi] = readings;
                    }

                    Match m = reHanyuPinlu.Match(line);
                    if (m.Success)
                    {
                        foreach (var token in m.Groups[1].Value.Split(' '))
                        {
                            if (token.Length == 0)
                            {
                                continue;
                            }
                            // Tokens look like "de(75596)"; drop the count. A token with no
                            // parenthesis is taken as-is rather than throwing.
                            int parenPos = token.IndexOf('(');
                            readings.Add(parenPos < 0 ? token : token.Substring(0, parenPos));
                        }
                        continue;
                    }

                    m = reHanyuPinyin.Match(line);
                    if (m.Success)
                    {
                        foreach (string group in m.Groups[1].Value.Split(' '))
                        {
                            // Each group is "location:reading,reading,..."; keep the part after the colon.
                            foreach (var reading in group.Substring(group.IndexOf(':') + 1).Split(','))
                            {
                                readings.Add(reading);
                            }
                        }
                        continue;
                    }

                    m = reMandarin.Match(line);
                    if (m.Success)
                    {
                        // For now, we only consider characters that are on the (short-ish) frequency lists.
                        // This can be extended later, with Unihan-based decision to separate simplified from traditional.
                        if (!ranks.ContainsKey(hanzi))
                        {
                            continue;
                        }
                        foreach (var reading in m.Groups[1].Value.Split(' '))
                        {
                            readings.Add(reading);
                        }
                    }
                }
            }
            return result;
        }
Ejemplo n.º 5
0
        /// <summary>
        /// Reads a CEDICT-format file, indexing multi-character words by normalized pinyin and
        /// recording, for every character of every accepted entry, the syllable it was read with.
        /// </summary>
        /// <param name="fn">Path of the dictionary file.</param>
        /// <param name="pinyin">Used to check that every syllable of an entry is a valid numbered syllable.</param>
        public PolyDict(string fn, Pinyin pinyin)
        {
            // Sample entry:
            // 玩意兒 玩意儿 [wan2 yi4 r5] /erhua variant of 玩意[wan2 yi4]/
            var entryPattern = new Regex(@"^([^ ]+) ([^ ]+) \[([^\]]+)\]");

            using (var reader = new StreamReader(fn))
            {
                string entry;
                while ((entry = reader.ReadLine()) != null)
                {
                    var parsed = entryPattern.Match(entry);
                    if (!parsed.Success)
                    {
                        continue;
                    }

                    // Normalize the pinyin: "u:" becomes "v", the neutral-tone digit is dropped, all lower case.
                    string key = parsed.Groups[3].Value
                                 .Replace("u:", "v")
                                 .Replace("5", "")
                                 .ToLowerInvariant();
                    string[] syllables = key.Split(' ');

                    // Break both written forms into the units produced by asUniChars.
                    var simpChars = new List<string>();
                    foreach (string unit in asUniChars(parsed.Groups[2].Value))
                    {
                        simpChars.Add(unit);
                    }
                    var tradChars = new List<string>();
                    foreach (string unit in asUniChars(parsed.Groups[1].Value))
                    {
                        tradChars.Add(unit);
                    }

                    // The stored word forms have their units separated by single spaces.
                    string spacedTrad = string.Join(" ", tradChars.ToArray());
                    string spacedSimp = string.Join(" ", simpChars.ToArray());

                    // Reject on a count mismatch, any invalid syllable, or any non-Hanzi traditional unit.
                    // All checks run even after the first failure.
                    bool reject = syllables.Length != tradChars.Count || tradChars.Count != simpChars.Count;
                    foreach (string syllable in syllables)
                    {
                        if (!pinyin.IsNumSyllable(syllable))
                        {
                            reject = true;
                        }
                    }
                    foreach (string unit in tradChars)
                    {
                        if (!isHanzi(unit))
                        {
                            reject = true;
                        }
                    }
                    if (reject)
                    {
                        continue;
                    }

                    // The key is created in both word dictionaries even for single-character
                    // entries; only multi-character words are actually added to the lists.
                    if (!DictSimp.ContainsKey(key))
                    {
                        DictSimp[key] = new List<string>();
                    }
                    if (!DictTrad.ContainsKey(key))
                    {
                        DictTrad[key] = new List<string>();
                    }
                    if (simpChars.Count > 1)
                    {
                        DictSimp[key].Add(spacedSimp);
                        DictTrad[key].Add(spacedTrad);
                    }

                    // Record the syllable of each position against the character at that position.
                    for (int pos = 0; pos < syllables.Length; ++pos)
                    {
                        string simpChar = simpChars[pos];
                        string tradChar = tradChars[pos];
                        string syllable = syllables[pos];

                        if (!charReadingsSimp.ContainsKey(simpChar))
                        {
                            charReadingsSimp[simpChar] = new List<string>();
                        }
                        if (!charReadingsTrad.ContainsKey(tradChar))
                        {
                            charReadingsTrad[tradChar] = new List<string>();
                        }
                        charReadingsSimp[simpChar].Add(syllable);
                        charReadingsTrad[tradChar].Add(syllable);
                    }
                }
            }
        }