コード例 #1
0
ファイル: rawParser.cs プロジェクト: reactxx/rewise
    /// <summary>
    /// Builds the word-sense list for the Czech ("cs") wiktionary dump: reads the dump's
    /// ".sec.json" file and, for every page whose "čeština" section has subsections titled
    /// with a name from <c>WikiRawConsts.csWordSenses</c>, records one
    /// "pageTitle=sense1,sense2,..." line. The collected lines are written, sorted, to the
    /// dump's ".cs-senses.txt" file.
    /// </summary>
    public static void CSWordSenses()
    {
        var dumpPath = WikiRawConsts.loadStat()
                       .First(f => f.lang == "cs" && f.type == WikiRawConsts.wiktionary)
                       .fileNameDump();
        var knownSenses = WikiRawConsts.csWordSenses.ToHashSet();
        var output = new List<string>();

        JsonNew.DeserializeEnum<Sections>(dumpPath + ".sec.json", page => {
            // Only pages that have a "čeština" section with its own subsections are relevant.
            var czech = page.subsections?.FirstOrDefault(s => s.title == "čeština");
            if (czech?.subsections == null)
            {
                return;
            }

            // Distinct subsection titles that are known sense names, in order of first appearance.
            var found = (from sub in czech.subsections
                         where knownSenses.Contains(sub.title)
                         select sub.title).Distinct().ToArray();
            if (found.Length > 0)
            {
                output.Add($"{page.title}={string.Join(",", found)}");
            }
        });

        File.WriteAllLines(dumpPath + ".cs-senses.txt", output.OrderBy(line => line));
    }
コード例 #2
0
ファイル: rawParser.cs プロジェクト: reactxx/rewise
    /// <summary>
    /// For each raw wiktionary file (at most 4 processed in parallel), counts how often each
    /// string produced by <c>sect.lines(0, "")</c> occurs across the file's ".sec.json"
    /// sections and writes the entries seen at least 10 times, ordered by key, as
    /// "key #count" lines to the file's ".sec-stat.txt".
    /// </summary>
    public static void SectionStats()
    {
        var options = new ParallelOptions { MaxDegreeOfParallelism = 4 };

        Parallel.ForEach(WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary), options, rawFile => {
            var counts = new Dictionary<string, int>();

            JsonNew.DeserializeEnum<Sections>(rawFile.fileNameDump() + ".sec.json", page => {
                // Once more than 5000 distinct keys exist, remaining sections are ignored.
                // The check is made once per section, so a single section may push the
                // dictionary past the limit.
                if (counts.Count <= 5000)
                {
                    foreach (var key in page.lines(0, ""))
                    {
                        // A missing key leaves 'seen' at 0, so the first occurrence stores 1.
                        counts.TryGetValue(key, out int seen);
                        counts[key] = seen + 1;
                    }
                }
            });

            var report = from entry in counts
                         where entry.Value >= 10
                         orderby entry.Key
                         select $"{entry.Key} #{entry.Value}";
            File.WriteAllLines(rawFile.fileNameDump() + ".sec-stat.txt", report);
        });
    }
コード例 #3
0
ファイル: rawParser.cs プロジェクト: reactxx/rewise
 /// <summary>
 /// Converts every raw wiktionary dump that has at least <c>wiktPageNumLimit</c> pages into a
 /// ".parsed.json" file, processing at most 6 dumps in parallel.
 /// </summary>
 /// <remarks>
 /// Disambiguation, redirect and special pages are skipped. For the "cs" dump only pages that
 /// contain a "čeština" section are kept; for every other language a page is kept when it has
 /// at least one section. In the per-page callback the page's <c>Text</c> is set to "" and a
 /// progress line is printed for every 10000th page.
 /// </remarks>
 public static void ParseToJson()
 {
     var bigEnoughFiles = WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit);
     var options = new ParallelOptions { MaxDegreeOfParallelism = 6 };

     Parallel.ForEach(bigEnoughFiles, options, rf => {
         IEnumerable<WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles.Where(article => !article.IsDisambiguation && !article.IsRedirect && !article.IsSpecialPage);

         // The section name is compared with an ordinal, case-insensitive comparison. The result
         // of a parameterless ToLower() depends on the current culture (for example upper-case 'I'
         // under Turkish casing rules), which would make the filter machine-dependent.
         var wantedPages = pages.Where(p => p.Sections.Any(s =>
             rf.lang != "cs" || string.Equals(s.SectionName.Trim(), "čeština", StringComparison.OrdinalIgnoreCase)));

         var cnt = 0;
         JsonNew.SerializeEnum<WikimediaPage>(rf.fileNameDump() + ".parsed.json", wantedPages.identityEnum(page => {
             if (cnt % 10000 == 0)
             {
                 Console.WriteLine($"{rf.lang} {cnt}");
             }
             cnt++;
             page.Text = "";
         }));
     });
 }
コード例 #4
0
ファイル: rawParser.cs プロジェクト: reactxx/rewise
    /// <summary>
    /// Writes a ".sec.json" file for every raw wiktionary dump that has at least
    /// <c>wiktPageNumLimit</c> pages (at most 6 dumps in parallel): each page that is not a
    /// disambiguation, redirect or special page is wrapped in a <c>Sections</c> object and
    /// serialized. The number of pages counted per dump is stored in the matching statistics
    /// entry, and the statistics are saved once all dumps are done.
    /// </summary>
    public static void ExtractSections()
    {
        var stat = WikiRawConsts.loadStat();
        var bigEnoughFiles = WikiRawConsts.getRawFiles(WikiRawConsts.wiktionary).Where(rf => rf.pages >= wiktPageNumLimit);
        var options = new ParallelOptions { MaxDegreeOfParallelism = 6 };

        Parallel.ForEach(bigEnoughFiles, options, rf => {
            IEnumerable<WikimediaPage> pages = new Wikimedia(rf.fileName()).Articles
                .Where(article => !(article.IsDisambiguation || article.IsRedirect || article.IsSpecialPage));

            var cnt = 0;
            var counted = pages.Select(p => new Sections(p)).identityEnum(sect => {
                // Progress output for every 100000th section, starting with the first one.
                var reportNow = cnt % 100000 == 0;
                if (reportNow)
                {
                    Console.WriteLine($"{rf.lang} {cnt}");
                }
                cnt += 1;
            });
            JsonNew.SerializeEnum<Sections>(rf.fileNameDump() + ".sec.json", counted);

            // The statistics list is shared by all parallel workers, so the update is serialized.
            lock (stat)
            {
                var entry = stat.First(s => s.type == WikiRawConsts.wiktionary && s.lang == rf.lang);
                entry.pages = cnt;
            }
        });

        WikiRawConsts.saveStat();
    }
コード例 #5
0
ファイル: UtilsTest.cs プロジェクト: reactxx/rewise
 /// <summary>
 /// Serializes <c>objs()</c> to a temporary JSON file once, then reads that file back from
 /// 100 parallel iterations and checks that each read visits <c>cnt * 2</c> items.
 /// </summary>
 public void Test1()
 {
     const string path = @"c:\temp\pom.json";

     JsonNew.SerializeEnum(path, objs());

     Parallel.ForEach(Enumerable.Range(0, 100), _ => {
         // Each iteration keeps its own counter; nothing is shared between iterations.
         var seen = 0;
         JsonNew.DeserializeEnum(typeof(X), path, item => {
             seen += 1;
         });
         Assert.Equal(cnt * 2, seen);
     });
 }
コード例 #6
0
ファイル: Wikimedia.cs プロジェクト: reactxx/rewise
        /// <summary>
        /// Serializes a set of <see cref="WikimediaPage"/>s to a file by passing them to
        /// <c>JsonNew.SerializeEnum</c>, counting the articles in the process.
        /// </summary>
        /// <param name="articles">A set of articles, probably from <see cref="ReadArticlesFromXmlDump"/></param>
        /// <param name="outputFilename">The filename into which articles should be saved</param>
        /// <returns>The number of articles counted by the per-article callback during serialization</returns>
        public static int WriteToDisk(IEnumerable<WikimediaPage> articles, string outputFilename)
        {
            var written = 0;

            // The callback increments the counter for each article it is invoked on.
            var counted = articles.identityEnum(_ => ++written);
            JsonNew.SerializeEnum<WikimediaPage>(outputFilename, counted);

            return written;
        }
コード例 #7
0
        /// <summary>
        /// Calls <c>JsonNew.GetIndexOfFirstByteToEncode</c> on the <c>_jsonData</c> buffer
        /// <c>ITER_COUNT</c> times and returns the result of the last call.
        /// NOTE(review): this looks like a micro-benchmark body — confirm before restructuring.
        /// </summary>
        /// <returns>
        /// The value returned by the final call, or 0 (the <c>int</c> default) when the loop
        /// body never runs.
        /// </returns>
        public int WithVector()
        {
            //Debugger.Launch();
            //Debugger.Break();

            // Local copy of the field; every call in the loop receives this same reference.
            byte[] data = _jsonData;
            _ = _jsonData.Length; // tell runtime this is not null

            int retVal = default;

            // Each iteration overwrites retVal, so only the last result is returned.
            for (int i = 0; i < ITER_COUNT; i++)
            {
                retVal = JsonNew.GetIndexOfFirstByteToEncode(data);
            }
            return(retVal);
        }