/// <summary>
/// Extracts the section tree of every article in each wiktionary raw dump whose page
/// count is at least <c>wiktPageNumLimit</c>, streaming the results to
/// "&lt;dump&gt;.sec.json" (one <see cref="Sections"/> per article). Dumps are processed
/// in parallel (at most 6 at a time). The per-language page count is written back into
/// the shared stat list, which is saved once all dumps are done.
/// </summary>
public static void ExtractSections() {
    var stat = WikiRawConsts.loadStat();
    Parallel.ForEach(
        WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit),
        new ParallelOptions { MaxDegreeOfParallelism = 6 },
        rf => {
            // Lazily enumerate real articles only: skip disambiguations, redirects and special pages.
            IEnumerable<WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles
                .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);
            var cnt = 0;
            // identityEnum is used purely for its side effect while the stream is serialized:
            // periodic progress output and counting the items actually written.
            JsonNew.SerializeEnum<Sections>(
                rf.fileNameDump() + ".sec.json",
                pages.Select(p => new Sections(p)).identityEnum(sect => {
                    if (cnt % 100000 == 0) { Console.WriteLine($"{rf.lang} {cnt}"); }
                    cnt++;
                }));
            // stat is shared by all parallel workers; guard the lookup + write.
            lock (stat) {
                stat.First(s => s.type == WikiRawConsts.wiktionary && s.lang == rf.lang).pages = cnt;
            }
        });
    WikiRawConsts.saveStat();
}
/// <summary>
/// Parses each wiktionary raw dump whose page count is at least <c>wiktPageNumLimit</c>
/// and streams the surviving pages (with their raw wikitext stripped) to
/// "&lt;dump&gt;.parsed.json". A page survives when it has at least one section; for the
/// Czech dump ("cs") the page must additionally contain a "čeština" section.
/// Dumps are processed in parallel (at most 6 at a time).
/// </summary>
public static void ParseToJson() {
    Parallel.ForEach(
        WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit),
        new ParallelOptions { MaxDegreeOfParallelism = 6 },
        rf => {
            // Lazily enumerate real articles only: skip disambiguations, redirects and special pages.
            IEnumerable<WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles
                .Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);
            var cnt = 0;
            JsonNew.SerializeEnum<WikimediaPage>(
                rf.fileNameDump() + ".parsed.json",
                pages
                    // Non-"cs" dumps: keep any page with at least one section.
                    // "cs" dump: keep only pages with a "čeština" section. Ordinal ignore-case
                    // comparison avoids culture-sensitive ToLower() surprises (e.g. tr-TR 'I' -> 'ı').
                    .Where(p => p.Sections.Any(s => rf.lang != "cs"
                        || string.Equals(s.SectionName.Trim(), "čeština", StringComparison.OrdinalIgnoreCase)))
                    .identityEnum(page => {
                        if (cnt % 10000 == 0) { Console.WriteLine($"{rf.lang} {cnt}"); }
                        cnt++;
                        page.Text = ""; // drop the raw wikitext to keep the output file small
                    }));
        });
}
/// <summary>
/// Round-trip test: serializes <c>objs()</c> to a temp file once, then deserializes it
/// concurrently from 100 parallel readers, each checking it observes every item.
/// NOTE(review): assumes <c>cnt</c> (declared outside this method) is half the number of
/// items produced by <c>objs()</c> — verify against the fixture that defines them.
/// </summary>
public void Test1() {
    JsonNew.SerializeEnum(@"c:\temp\pom.json", objs());
    Parallel.ForEach(Enumerable.Range(0, 100), idx => {
        var count = 0;
        JsonNew.DeserializeEnum(typeof(X), @"c:\temp\pom.json", x => { count++; });
        Assert.Equal(cnt * 2, count);
    });
}
/// <summary>
/// Streams a set of <see cref="WikimediaPage"/>s to disk as JSON, one serialized page
/// per entry, via <see cref="JsonNew"/>.
/// </summary>
/// <param name="articles">A set of articles, probably from <see cref="ReadArticlesFromXmlDump"/></param>
/// <param name="outputFilename">The filename into which articles should be saved</param>
/// <returns>The number of articles written</returns>
public static int WriteToDisk(IEnumerable<WikimediaPage> articles, string outputFilename) {
    var numberOfArticles = 0;
    // identityEnum counts each article as it streams through the serializer.
    JsonNew.SerializeEnum<WikimediaPage>(outputFilename, articles.identityEnum(a => ++numberOfArticles));
    return numberOfArticles;
}