Пример #1
0
        /// <summary>
        /// Reads every row of the <c>html</c> table from the given database file, decompresses
        /// each row's GZip'd body into text, and sorts the resulting pages into valid and
        /// invalid lists depending on whether the page id is present in <paramref name="pageIndices"/>.
        /// </summary>
        /// <param name="filename">Database file passed through to <c>XowaParser.ExecuteSql</c>.</param>
        /// <param name="pageIndices">Lookup from page id to the index entry carrying that page's titles.</param>
        /// <returns>
        /// A list whose <c>ValidPages</c> hold the pages found in <paramref name="pageIndices"/> and whose
        /// <c>InvalidPages</c> hold the remaining pages, titled "&lt;Unknown&gt;" with no secondary titles.
        /// </returns>
        public WikiPageList ReadHtmlDbFile(string filename, Dictionary <Int64, PageIndex> pageIndices)
        {
            const string query = "SELECT page_id, body FROM html";
            var result = new WikiPageList();

            System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");
            XowaParser.ExecuteSql(filename, query, (row) =>
            {
                Int64 id = (Int64)row["page_id"];
                byte[] compressed = (byte[])row["body"];

                var wikiPage = new WikiPage();
                wikiPage.Id = id;

                // The body column is GZip-compressed; inflate it and read it back as text.
                using (var raw = new MemoryStream(compressed))
                using (var inflater = new GZipInputStream(raw))
                using (var text = new StreamReader(inflater))
                {
                    wikiPage.Content = text.ReadToEnd();
                }

                // Pages without an index entry are kept, but filed separately under a placeholder title.
                PageIndex entry;
                bool known = pageIndices.TryGetValue(id, out entry);
                if (!known)
                {
                    wikiPage.PrimaryTitle = "<Unknown>";
                    wikiPage.SecondaryTitles = new List <string>();
                    result.InvalidPages.Add(wikiPage);
                    return;
                }

                wikiPage.PrimaryTitle = entry.PrimaryTitle;
                wikiPage.SecondaryTitles = entry.SecondaryTitles;
                result.ValidPages.Add(wikiPage);
            });

            System.Console.WriteLine($"Read {result.ValidPages.Count} total valid pages and {result.InvalidPages.Count} invalid pages.");
            return result;
        }
Пример #2
0
        /// <summary>
        /// Reads every row of the <c>html</c> table from the given database file and stores each
        /// row's body bytes as read, without decoding them. Pages are sorted into valid and invalid
        /// lists depending on whether the page id is present in <paramref name="pageIndices"/>.
        /// TODO: Deduplicate
        /// </summary>
        /// <param name="filename">Database file passed through to <c>XowaParser.ExecuteSql</c>.</param>
        /// <param name="pageIndices">Lookup from page id to the index entry carrying that page's titles.</param>
        /// <returns>
        /// A list whose <c>ValidPages</c> hold the pages found in <paramref name="pageIndices"/> and whose
        /// <c>InvalidPages</c> hold the remaining pages, titled "&lt;Unknown&gt;" with no secondary titles.
        /// </returns>
        public CompressedWikiPageList ReadCompressedHtmlDbFile(string filename, Dictionary <Int64, PageIndex> pageIndices)
        {
            const string query = "SELECT page_id, body FROM html";
            var result = new CompressedWikiPageList();

            System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");
            XowaParser.ExecuteSql(filename, query, (row) =>
            {
                Int64 id = (Int64)row["page_id"];
                byte[] rawBody = (byte[])row["body"];

                // The body is kept exactly as stored in the database; no decoding happens here.
                var compressedPage = new CompressedWikiPage();
                compressedPage.Id = id;
                compressedPage.Content = rawBody;

                // Pages without an index entry are kept, but filed separately under a placeholder title.
                PageIndex entry;
                bool known = pageIndices.TryGetValue(id, out entry);
                if (!known)
                {
                    compressedPage.PrimaryTitle = "<Unknown>";
                    compressedPage.SecondaryTitles = new List <string>();
                    result.InvalidPages.Add(compressedPage);
                    return;
                }

                compressedPage.PrimaryTitle = entry.PrimaryTitle;
                compressedPage.SecondaryTitles = entry.SecondaryTitles;
                result.ValidPages.Add(compressedPage);
            });

            System.Console.WriteLine($"Read {result.ValidPages.Count} total valid pages and {result.InvalidPages.Count} invalid pages.");
            return result;
        }