Exemple #1
0
 /// <summary>
 /// Expands a compressed page into a <see cref="WikiPage"/> and forwards it to the
 /// <c>ProcessPage</c> overload that accepts a <see cref="WikiPage"/>.
 /// </summary>
 /// <param name="page">
 /// Page whose <c>Content</c> bytes are read through a <c>GZipInputStream</c>
 /// and decoded to text with a <see cref="StreamReader"/>.
 /// </param>
 internal void ProcessPage(CompressedWikiPage page)
 {
     using (var rawBytes = new MemoryStream(page.Content))
     {
         using (var inflated = new GZipInputStream(rawBytes))
         {
             using (var textReader = new StreamReader(inflated))
             {
                 // Identity and titles are copied unchanged; only Content is
                 // replaced by the text read from the decompressing stream.
                 var expandedPage = new WikiPage()
                 {
                     Id              = page.Id,
                     PrimaryTitle    = page.PrimaryTitle,
                     SecondaryTitles = page.SecondaryTitles,
                     Content         = textReader.ReadToEnd()
                 };

                 // Forwarded while the streams are still open.
                 this.ProcessPage(expandedPage);
             }
         }
     }
 }
Exemple #2
0
        /// <summary>
        /// Reads every <c>page_id</c>/<c>body</c> row of the <c>html</c> table in the given
        /// database file and sorts the resulting pages into valid and invalid lists,
        /// depending on whether the page id is present in <paramref name="pageIndices"/>.
        /// TODO: Deduplicate
        /// </summary>
        /// <param name="filename">Database file passed to <c>XowaParser.ExecuteSql</c>.</param>
        /// <param name="pageIndices">Lookup from page id to the title information for that page.</param>
        /// <returns>
        /// A list whose <c>ValidPages</c> carry the titles found in <paramref name="pageIndices"/> and
        /// whose <c>InvalidPages</c> carry the placeholder title <c>&lt;Unknown&gt;</c> and no secondary titles.
        /// </returns>
        public CompressedWikiPageList ReadCompressedHtmlDbFile(string filename, Dictionary <Int64, PageIndex> pageIndices)
        {
            const string query = "SELECT page_id, body FROM html";

            CompressedWikiPageList result = new CompressedWikiPageList();

            System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");
            XowaParser.ExecuteSql(filename, query, row =>
            {
                Int64 id          = (Int64)row["page_id"];
                byte[] storedBody = (byte[])row["body"];

                // The body is stored as-is; no decompression happens here.
                CompressedWikiPage entry = new CompressedWikiPage()
                {
                    Id      = id,
                    Content = storedBody
                };

                PageIndex titles = null;
                if (!pageIndices.TryGetValue(id, out titles))
                {
                    // No index entry for this id: keep the page, but file it as invalid
                    // with placeholder title data.
                    entry.PrimaryTitle    = "<Unknown>";
                    entry.SecondaryTitles = new List <string>();
                    result.InvalidPages.Add(entry);
                    return;
                }

                // Known id: attach the indexed titles and keep the page as valid.
                entry.PrimaryTitle    = titles.PrimaryTitle;
                entry.SecondaryTitles = titles.SecondaryTitles;
                result.ValidPages.Add(entry);
            });

            var validCount   = result.ValidPages.Count;
            var invalidCount = result.InvalidPages.Count;
            System.Console.WriteLine($"Read {validCount} total valid pages and {invalidCount} invalid pages.");

            return result;
        }