public static void CSWordSenses() { var fn = WikiRawConsts.loadStat().First(f => f.lang == "cs" && f.type == WikiRawConsts.wiktionary).fileNameDump(); var names = WikiRawConsts.csWordSenses.ToHashSet(); var lines = new List <string>(); JsonNew.DeserializeEnum <Sections>(fn + ".sec.json", sect => { if (sect.subsections == null) { return; } var cs = sect.subsections.FirstOrDefault(s => s.title == "čeština"); if (cs == null || cs.subsections == null) { return; } var senses = cs.subsections.Select(scs => scs.title).Where(s => names.Contains(s)).Distinct().ToArray(); if (senses.Length == 0) { return; } lines.Add($"{sect.title}={string.Join(",", senses)}"); }); //using (var rdr = new JsonStreamReader(fn + ".sec.json")) { // foreach (var sect in rdr.Deserialize<Sections>()) { // if (sect.subsections == null) continue; // var cs = sect.subsections.FirstOrDefault(s => s.title == "čeština"); // if (cs == null || cs.subsections == null) continue; // var senses = cs.subsections.Select(scs => scs.title).Where(s => names.Contains(s)).Distinct().ToArray(); // if (senses.Length == 0) continue; // lines.Add($"{sect.title}={string.Join(",", senses)}"); // } //} File.WriteAllLines(fn + ".cs-senses.txt", lines.OrderBy(s => s)); }
public static void ParseToJson() { Parallel.ForEach(WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit), new ParallelOptions { MaxDegreeOfParallelism = 6 }, rf => { //var rf = WikiRawConsts.loadStat().First(f => f.lang == "cs" && f.type == WikiRawConsts.wiktionary); IEnumerable <WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles.Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage); var cnt = 0; JsonNew.SerializeEnum <WikimediaPage>(rf.fileNameDump() + ".parsed.json", pages.Where(p => p.Sections.FirstOrDefault(s => rf.lang != "cs" || s.SectionName.Trim().ToLower() == "čeština") != null).identityEnum(page => { if (cnt % 10000 == 0) { Console.WriteLine($"{rf.lang} {cnt}"); } cnt++; page.Text = ""; }) ); //using (var wr = new JsonStreamWriter(rf.fileNameDump() + ".parsed.json")) // foreach (var page in pages.Where(p => p.Sections.FirstOrDefault(s => rf.lang != "cs" || s.SectionName.Trim().ToLower() == "čeština") != null)) { // if (cnt % 10000 == 0) Console.WriteLine($"{rf.lang} {cnt}"); // cnt++; // page.Text = ""; // wr.Serialize(page); // } }); }
public static void SectionStats() { Parallel.ForEach(WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary), new ParallelOptions { MaxDegreeOfParallelism = 4 }, rf => { var sectStat = new Dictionary <string, int>(); void add(string l) => sectStat[l] = sectStat.TryGetValue(l, out int c) ? c + 1 : 1; JsonNew.DeserializeEnum <Sections>(rf.fileNameDump() + ".sec.json", sect => { if (sectStat.Count > 5000) { return; } foreach (var s in sect.lines(0, "")) { add(s); } }); //using (var rdr = new JsonStreamReader(rf.fileNameDump() + ".sec.json")) { // foreach (var sect in rdr.Deserialize<Sections>()) { // if (sectStat.Count > 5000) break; // foreach (var s in sect.lines(0, "")) add(s); // } //} File.WriteAllLines(rf.fileNameDump() + ".sec-stat.txt", sectStat.Where(kv => kv.Value >= 10).OrderBy(s => s.Key).Select(s => $"{s.Key} #{s.Value}")); }); }
public static void ExtractSections() { var stat = WikiRawConsts.loadStat(); Parallel.ForEach(WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit), new ParallelOptions { MaxDegreeOfParallelism = 6 }, rf => { IEnumerable <WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles.Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage); var cnt = 0; JsonNew.SerializeEnum <Sections>(rf.fileNameDump() + ".sec.json", pages.Select(p => new Sections(p)).identityEnum(sect => { if (cnt % 100000 == 0) { Console.WriteLine($"{rf.lang} {cnt}"); } cnt++; })); //using (var wr = new JsonStreamWriter(rf.fileNameDump() + ".sec.json")) // foreach (var sect in pages.Select(p => new Sections(p))) { // if (cnt % 100000 == 0) Console.WriteLine($"{rf.lang} {cnt}"); // cnt++; // wr.Serialize(sect); // } lock (stat) { stat.First(s => s.type == WikiRawConsts.wiktionary && s.lang == rf.lang).pages = cnt; } }); WikiRawConsts.saveStat(); }