Exemplo n.º 1
0
    /// <summary>
    /// Scans the "*.sec.json" section dump belonging to the "cs" wiktionary statistics entry and
    /// writes a "*.cs-senses.txt" file listing, per top-level section, the matching sub-headings
    /// found under its "čeština" subsection.
    /// </summary>
    /// <remarks>
    /// Only sub-heading titles contained in <c>WikiRawConsts.csWordSenses</c> are kept. Each output
    /// line has the form <c>title=heading1,heading2,...</c> (duplicates removed), and the lines are
    /// ordered with the default string ordering before being written.
    /// </remarks>
    public static void CSWordSenses()
    {
        var dumpPath    = WikiRawConsts.loadStat().First(entry => entry.lang == "cs" && entry.type == WikiRawConsts.wiktionary).fileNameDump();
        var knownSenses = WikiRawConsts.csWordSenses.ToHashSet();
        var output      = new List <string>();

        JsonNew.DeserializeEnum <Sections>(dumpPath + ".sec.json", page =>
        {
            // Locate the "čeština" part of the page; pages without subsections yield null here.
            var czech = page.subsections?.FirstOrDefault(sub => sub.title == "čeština");
            if (czech?.subsections == null)
            {
                return;
            }

            // Keep only the headings that are recognised word-sense names, without repeats.
            var found = czech.subsections
                        .Select(sub => sub.title)
                        .Where(title => knownSenses.Contains(title))
                        .Distinct()
                        .ToArray();

            if (found.Length > 0)
            {
                output.Add($"{page.title}={string.Join(",", found)}");
            }
        });

        File.WriteAllLines(dumpPath + ".cs-senses.txt", output.OrderBy(line => line));
    }
Exemplo n.º 2
0
    /// <summary>
    /// Produces a "*.sec.json" section dump for every raw wiktionary file whose page count is at
    /// least <c>wiktPageNumLimit</c>, then stores the per-language counter in the statistics and
    /// saves them.
    /// </summary>
    /// <remarks>
    /// Files are processed in parallel (at most 6 at a time). Disambiguation, redirect and special
    /// pages are excluded. Progress is printed to the console whenever the counter is a multiple
    /// of 100000. NOTE(review): the counter relies on <c>identityEnum</c> invoking its callback
    /// once per enumerated item — confirm against that helper.
    /// </remarks>
    public static void ExtractSections()
    {
        var stat     = WikiRawConsts.loadStat();
        var rawFiles = WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(file => file.pages >= wiktPageNumLimit);
        var options  = new ParallelOptions {
            MaxDegreeOfParallelism = 6
        };

        Parallel.ForEach(rawFiles, options, rawFile =>
        {
            IEnumerable <WikimediaPage> articles = new Wikimedia(rawFile.fileName()).Articles
                                                   .Where(a => !(a.IsDisambiguation || a.IsRedirect || a.IsSpecialPage));
            var processed = 0;

            var targetPath = rawFile.fileNameDump() + ".sec.json";
            var sections   = articles.Select(a => new Sections(a)).identityEnum(_ =>
            {
                if (processed % 100000 == 0)
                {
                    Console.WriteLine($"{rawFile.lang} {processed}");
                }
                processed++;
            });
            JsonNew.SerializeEnum <Sections>(targetPath, sections);

            // The statistics list is shared by all parallel workers, so update it under a lock.
            lock (stat)
            {
                var entry = stat.First(s => s.type == WikiRawConsts.wiktionary && s.lang == rawFile.lang);
                entry.pages = processed;
            }
        });

        WikiRawConsts.saveStat();
    }