/// <summary>
/// Loads every row of the <c>html</c> table from the given database file,
/// decompresses each page body to text, and pairs it with its title
/// information from <paramref name="pageIndices"/>.
/// </summary>
/// <param name="filename">Path of the database file passed to <c>XowaParser.ExecuteSql</c>.</param>
/// <param name="pageIndices">Lookup from page id to the index entry carrying the page titles.</param>
/// <returns>
/// A list whose <c>ValidPages</c> holds the pages found in <paramref name="pageIndices"/>
/// and whose <c>InvalidPages</c> holds the pages that had no index entry.
/// </returns>
public WikiPageList ReadHtmlDbFile(string filename, Dictionary<Int64, PageIndex> pageIndices)
{
    const string query = "SELECT page_id, body FROM html";
    var result = new WikiPageList();

    System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");

    // The callback is invoked by ExecuteSql with a reader exposing the selected columns.
    XowaParser.ExecuteSql(filename, query, (row) =>
    {
        long id = (long)row["page_id"];
        byte[] compressed = (byte[])row["body"];
        var page = new WikiPage { Id = id };

        // The body column is stored GZip-compressed; inflate it and read it
        // as text using StreamReader's default encoding.
        using (var raw = new MemoryStream(compressed))
        using (var inflater = new GZipInputStream(raw))
        using (var text = new StreamReader(inflater))
        {
            page.Content = text.ReadToEnd();
        }

        // Attach titles when the page is indexed; otherwise file it as invalid.
        PageIndex entry;
        bool indexed = pageIndices.TryGetValue(id, out entry);
        if (!indexed)
        {
            // No index entry: use a placeholder title and an empty title list.
            page.PrimaryTitle = "<Unknown>";
            page.SecondaryTitles = new List<string>();
            result.InvalidPages.Add(page);
            return;
        }

        page.PrimaryTitle = entry.PrimaryTitle;
        page.SecondaryTitles = entry.SecondaryTitles;
        result.ValidPages.Add(page);
    });

    System.Console.WriteLine($"Read {result.ValidPages.Count} total valid pages and {result.InvalidPages.Count} invalid pages.");
    return result;
}
/// <summary>
/// Loads every row of the <c>html</c> table from the given database file and
/// pairs each page with its title information from <paramref name="pageIndices"/>.
/// The page body is stored on the page exactly as read from the <c>body</c>
/// column; no decompression is performed here.
/// TODO: Deduplicate (this method mirrors <c>ReadHtmlDbFile</c> apart from
/// the body handling and the page/list types).
/// </summary>
/// <param name="filename">Path of the database file passed to <c>XowaParser.ExecuteSql</c>.</param>
/// <param name="pageIndices">Lookup from page id to the index entry carrying the page titles.</param>
/// <returns>
/// A list whose <c>ValidPages</c> holds the pages found in <paramref name="pageIndices"/>
/// and whose <c>InvalidPages</c> holds the pages that had no index entry.
/// </returns>
public CompressedWikiPageList ReadCompressedHtmlDbFile(string filename, Dictionary<Int64, PageIndex> pageIndices)
{
    const string query = "SELECT page_id, body FROM html";
    var result = new CompressedWikiPageList();

    System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");

    // The callback is invoked by ExecuteSql with a reader exposing the selected columns.
    XowaParser.ExecuteSql(filename, query, (row) =>
    {
        long id = (long)row["page_id"];
        byte[] rawBody = (byte[])row["body"];

        // Keep the bytes as they came from the database.
        var page = new CompressedWikiPage { Id = id, Content = rawBody };

        // Attach titles when the page is indexed; otherwise file it as invalid.
        PageIndex entry;
        bool indexed = pageIndices.TryGetValue(id, out entry);
        if (!indexed)
        {
            // No index entry: use a placeholder title and an empty title list.
            page.PrimaryTitle = "<Unknown>";
            page.SecondaryTitles = new List<string>();
            result.InvalidPages.Add(page);
            return;
        }

        page.PrimaryTitle = entry.PrimaryTitle;
        page.SecondaryTitles = entry.SecondaryTitles;
        result.ValidPages.Add(page);
    });

    System.Console.WriteLine($"Read {result.ValidPages.Count} total valid pages and {result.InvalidPages.Count} invalid pages.");
    return result;
}