Exemple #1
0
        /// <summary>
        /// Runs <c>SELECT page_id, body FROM html</c> against the given database file and
        /// wraps every returned row in a <see cref="CompressedWikiPage"/> whose content is
        /// the raw <c>body</c> bytes exactly as read.
        /// Rows whose page id is present in <paramref name="pageIndices"/> receive the titles
        /// from that entry and are collected as valid pages; all other rows get the
        /// placeholder title <c>&lt;Unknown&gt;</c> and are collected as invalid pages.
        /// TODO: Deduplicate
        /// </summary>
        /// <param name="filename">Database file handed to <c>XowaParser.ExecuteSql</c>; only its file name is echoed to the console.</param>
        /// <param name="pageIndices">Lookup from page id to the index entry that supplies the page titles.</param>
        /// <returns>The list object holding both the valid and the invalid pages that were read.</returns>
        public CompressedWikiPageList ReadCompressedHtmlDbFile(string filename, Dictionary <Int64, PageIndex> pageIndices)
        {
            const string query = "SELECT page_id, body FROM html";

            CompressedWikiPageList result = new CompressedWikiPageList();

            System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");
            XowaParser.ExecuteSql(filename, query, (row) =>
            {
                Int64 id      = (Int64)row["page_id"];
                byte[] rawBody = (byte[])row["body"];

                CompressedWikiPage entry = new CompressedWikiPage();
                entry.Id      = id;
                entry.Content = rawBody;

                // Attach titles when the id is known; otherwise file the page under "invalid".
                PageIndex match = null;
                if (!pageIndices.TryGetValue(id, out match))
                {
                    entry.PrimaryTitle    = "<Unknown>";
                    entry.SecondaryTitles = new List <string>();
                    result.InvalidPages.Add(entry);
                    return;
                }

                entry.PrimaryTitle    = match.PrimaryTitle;
                entry.SecondaryTitles = match.SecondaryTitles;
                result.ValidPages.Add(entry);
            });

            System.Console.WriteLine($"Read {result.ValidPages.Count} total valid pages and {result.InvalidPages.Count} invalid pages.");
            return result;
        }
Exemple #2
0
        /// <summary>
        /// Reads every HTML database file reported by the parser for <c>InputFolder</c> and
        /// serializes its pages into <c>OutputFolder</c>.
        /// For the file at position <c>fileIdx</c> the valid pages go to
        /// <c>html-{fileIdx}.protobin</c>; if that file also produced invalid pages they go to
        /// <c>invalid/html-invalid-{fileIdx}.protobin</c>, with the <c>invalid</c> sub-folder
        /// created on demand.
        /// When <c>Compress</c> is set, a notice is printed, the method pauses for one second,
        /// and pages are read through <c>ReadCompressedHtmlDbFile</c> instead of
        /// <c>ReadHtmlDbFile</c>.
        /// </summary>
        /// <returns>Always 0.</returns>
        internal int Extract()
        {
            if (this.Compress)
            {
                System.Console.WriteLine("Skipping compression...");
                Thread.Sleep(1000);
            }

            XowaParser parser = new XowaParser(this.InputFolder);
            Dictionary <Int64, PageIndex> pageIndices = this.GetPageIndices(parser);

            int           fileIdx       = 0;
            List <string> htmlFiles     = parser.GetHtmlDbFileNames();
            string        invalidFolder = Path.Combine(this.OutputFolder, "invalid");

            foreach (string file in htmlFiles)
            {
                // We output a series of files to avoid (a) too many small files (b) too large of a single file.
                string validPath   = Path.Combine(this.OutputFolder, $"html-{fileIdx}.protobin");
                string invalidPath = Path.Combine(invalidFolder, $"html-invalid-{fileIdx}.protobin");

                if (this.Compress)
                {
                    CompressedWikiPageList pages = parser.ReadCompressedHtmlDbFile(file, pageIndices);

                    using (FileStream stream = File.Create(validPath))
                    {
                        Serializer.Serialize(stream, pages.ValidPages);
                    }

                    if (pages.InvalidPages.Any())
                    {
                        // CreateDirectory does nothing when the folder already exists, so the
                        // invalid pages are written even for the batch that first creates it.
                        Directory.CreateDirectory(invalidFolder);
                        using (FileStream stream = File.Create(invalidPath))
                        {
                            Serializer.Serialize(stream, pages.InvalidPages);
                        }
                    }
                }
                else
                {
                    WikiPageList pages = parser.ReadHtmlDbFile(file, pageIndices);

                    using (FileStream stream = File.Create(validPath))
                    {
                        Serializer.Serialize(stream, pages.ValidPages);
                    }

                    if (pages.InvalidPages.Any())
                    {
                        // CreateDirectory does nothing when the folder already exists, so the
                        // invalid pages are written even for the batch that first creates it.
                        Directory.CreateDirectory(invalidFolder);
                        using (FileStream stream = File.Create(invalidPath))
                        {
                            Serializer.Serialize(stream, pages.InvalidPages);
                        }
                    }
                }

                ++fileIdx;
            }

            return(0);
        }