/// <summary>
/// Obtains the page-id-to-index map. If an index file was supplied, it is
/// deserialized directly; otherwise the indices are read from the wiki
/// database and cached to "pageIndices.protobin" in the current directory.
/// </summary>
/// <param name="parser">Parser used to build the indices when no index file is available.</param>
/// <returns>Map from page id to its <see cref="PageIndex"/>.</returns>
private Dictionary<Int64, PageIndex> GetPageIndices(XowaParser parser)
{
    if (!string.IsNullOrWhiteSpace(this.IndexFile))
    {
        // Fast path: reuse the previously saved index file.
        using (FileStream stream = File.OpenRead(this.IndexFile))
        {
            return Serializer.Deserialize<Dictionary<Int64, PageIndex>>(stream);
        }
    }

    // Slow path: build the indices from the database, then cache them for next time.
    Dictionary<Int64, PageIndex> indices = parser.ReadPageIndices();
    string cachePath = Path.Combine(Directory.GetCurrentDirectory(), "pageIndices.protobin");
    using (FileStream stream = File.Create(cachePath))
    {
        Serializer.Serialize(stream, indices);
    }

    System.Console.WriteLine($"Saved the page indices to {cachePath}.");
    return indices;
}
/// <summary>
/// Reads the "page" table of the core wiki database and builds a
/// <see cref="PageIndex"/> per page, keyed by page id. A redirect row
/// contributes its title as a secondary title of the redirect *target*;
/// a regular row contributes the primary title. Rows may arrive in any
/// order, so a placeholder primary title ("&lt;Unset&gt;") is used until
/// the non-redirect row for a page is seen.
/// </summary>
/// <returns>Map from page id to its index entry.</returns>
public Dictionary<Int64, PageIndex> ReadPageIndices()
{
    string coreFilename = Path.Combine(this.inputFolder, $"{this.wikiName}{XowaParser.CoreSuffix}");
    string sql = "SELECT page_id, page_title, page_is_redirect, page_redirect_id FROM page";
    Dictionary<Int64, PageIndex> pageIndices = new Dictionary<Int64, PageIndex>();

    System.Console.WriteLine("Reading page indices...");
    XowaParser.ExecuteSql(coreFilename, sql, (reader) =>
    {
        Int64 pageId = (Int64)reader["page_id"];
        string title = (string)reader["page_title"];
        bool isRedirect = (Int64)reader["page_is_redirect"] == 1;
        Int64 redirectId = (Int64)reader["page_redirect_id"];

        // Redirect rows are filed under their target page's id.
        Int64 key = isRedirect ? redirectId : pageId;

        PageIndex entry;
        if (!pageIndices.TryGetValue(key, out entry))
        {
            entry = new PageIndex
            {
                Id = key,
                // Placeholder until the non-redirect row supplies the real title.
                PrimaryTitle = "<Unset>",
                SecondaryTitles = new List<string>(),
            };
            pageIndices.Add(key, entry);
        }

        if (isRedirect)
        {
            entry.SecondaryTitles.Add(title);
        }
        else
        {
            entry.PrimaryTitle = title;
        }
    });

    System.Console.WriteLine($"Read {pageIndices.Count} unique page indices.");
    return pageIndices;
}
/// <summary>
/// Reads every row of the "html" table in the given database file,
/// inflating each GZip-compressed body into text. Pages with a matching
/// entry in <paramref name="pageIndices"/> receive their titles and go to
/// ValidPages; the rest get a placeholder title and go to InvalidPages.
/// </summary>
/// <param name="filename">Path of the XOWA HTML database file.</param>
/// <param name="pageIndices">Map from page id to its index entry.</param>
/// <returns>The pages read, split into valid and invalid lists.</returns>
public WikiPageList ReadHtmlDbFile(string filename, Dictionary<Int64, PageIndex> pageIndices)
{
    string sql = "SELECT page_id, body FROM html";
    WikiPageList pages = new WikiPageList();

    System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");
    XowaParser.ExecuteSql(filename, sql, (reader) =>
    {
        Int64 pageId = (Int64)reader["page_id"];
        byte[] body = (byte[])reader["body"];
        WikiPage page = new WikiPage() { Id = pageId };

        // The stored body is GZip'd; decompress it into the page content.
        using (MemoryStream stream = new MemoryStream(body))
        using (GZipInputStream gzipStream = new GZipInputStream(stream))
        using (StreamReader streamReader = new StreamReader(gzipStream))
        {
            page.Content = streamReader.ReadToEnd();
        }

        // Associate titles from the index and file the page accordingly.
        PageIndex idx;
        bool known = pageIndices.TryGetValue(pageId, out idx);
        page.PrimaryTitle = known ? idx.PrimaryTitle : "<Unknown>";
        page.SecondaryTitles = known ? idx.SecondaryTitles : new List<string>();
        (known ? pages.ValidPages : pages.InvalidPages).Add(page);
    });

    System.Console.WriteLine($"Read {pages.ValidPages.Count} total valid pages and {pages.InvalidPages.Count} invalid pages.");
    return pages;
}
/// <summary>
/// TODO: Deduplicate.
/// Reads every row of the "html" table in the given database file, keeping
/// each body in its stored (compressed) form. Pages with a matching entry
/// in <paramref name="pageIndices"/> receive their titles and go to
/// ValidPages; the rest get a placeholder title and go to InvalidPages.
/// </summary>
/// <param name="filename">Path of the XOWA HTML database file.</param>
/// <param name="pageIndices">Map from page id to its index entry.</param>
/// <returns>The pages read, split into valid and invalid lists.</returns>
public CompressedWikiPageList ReadCompressedHtmlDbFile(string filename, Dictionary<Int64, PageIndex> pageIndices)
{
    string sql = "SELECT page_id, body FROM html";
    CompressedWikiPageList pages = new CompressedWikiPageList();

    System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");
    XowaParser.ExecuteSql(filename, sql, (reader) =>
    {
        Int64 pageId = (Int64)reader["page_id"];
        byte[] body = (byte[])reader["body"];

        // Body stays compressed exactly as stored in the database.
        CompressedWikiPage page = new CompressedWikiPage() { Id = pageId, Content = body };

        // Associate titles from the index and file the page accordingly.
        PageIndex idx;
        bool known = pageIndices.TryGetValue(pageId, out idx);
        page.PrimaryTitle = known ? idx.PrimaryTitle : "<Unknown>";
        page.SecondaryTitles = known ? idx.SecondaryTitles : new List<string>();
        (known ? pages.ValidPages : pages.InvalidPages).Add(page);
    });

    System.Console.WriteLine($"Read {pages.ValidPages.Count} total valid pages and {pages.InvalidPages.Count} invalid pages.");
    return pages;
}
/// <summary>
/// Extracts all pages from the XOWA HTML database files into a numbered
/// series of protobuf files under the output folder. Pages with no matching
/// page index are written separately into an "invalid" subfolder.
/// </summary>
/// <returns>Process exit code (always 0).</returns>
internal int Extract()
{
    if (this.Compress)
    {
        // NOTE(review): this message reads "Skipping compression" but is printed
        // when Compress is *enabled* (the branch that keeps bodies compressed) —
        // confirm the intended wording/condition.
        System.Console.WriteLine("Skipping compression...");
        Thread.Sleep(1000);
    }

    XowaParser parser = new XowaParser(this.InputFolder);
    Dictionary<Int64, PageIndex> pageIndices = this.GetPageIndices(parser);

    int fileIdx = 0;
    List<string> htmlFiles = parser.GetHtmlDbFileNames();
    foreach (string file in htmlFiles)
    {
        // We output a series of files to avoid (a) too many small files (b) too large of a single file.
        if (this.Compress)
        {
            CompressedWikiPageList pages = parser.ReadCompressedHtmlDbFile(file, pageIndices);
            using (FileStream stream = File.Create(Path.Combine(this.OutputFolder, $"html-{fileIdx}.protobin")))
            {
                Serializer.Serialize(stream, pages.ValidPages);
            }

            if (pages.InvalidPages.Any())
            {
                // BUG FIX: the old code created the "invalid" folder OR wrote the
                // invalid-pages file (if/else-if), so the first batch of invalid
                // pages was silently dropped. CreateDirectory is a no-op when the
                // folder already exists, so always create-then-write.
                string invalidFolder = Path.Combine(this.OutputFolder, "invalid");
                Directory.CreateDirectory(invalidFolder);
                using (FileStream stream = File.Create(Path.Combine(invalidFolder, $"html-invalid-{fileIdx}.protobin")))
                {
                    Serializer.Serialize(stream, pages.InvalidPages);
                }
            }
        }
        else
        {
            WikiPageList pages = parser.ReadHtmlDbFile(file, pageIndices);
            using (FileStream stream = File.Create(Path.Combine(this.OutputFolder, $"html-{fileIdx}.protobin")))
            {
                Serializer.Serialize(stream, pages.ValidPages);
            }

            if (pages.InvalidPages.Any())
            {
                // Same fix as the compressed branch: always create the folder
                // (no-op when present) and write the invalid pages.
                string invalidFolder = Path.Combine(this.OutputFolder, "invalid");
                Directory.CreateDirectory(invalidFolder);
                using (FileStream stream = File.Create(Path.Combine(invalidFolder, $"html-invalid-{fileIdx}.protobin")))
                {
                    Serializer.Serialize(stream, pages.InvalidPages);
                }
            }
        }

        ++fileIdx;
    }

    return 0;
}