Exemplo n.º 1
0
        /// <summary>
        /// Obtains the page-index table. When <c>IndexFile</c> is set, the table is
        /// deserialized from that file; otherwise it is read through
        /// <paramref name="parser"/> and a copy is saved as "pageIndices.protobin"
        /// in the current working directory.
        /// </summary>
        /// <param name="parser">Parser used to read the indices when no index file is configured.</param>
        /// <returns>The page indices keyed by page id.</returns>
        private Dictionary<Int64, PageIndex> GetPageIndices(XowaParser parser)
        {
            bool hasIndexFile = !string.IsNullOrWhiteSpace(this.IndexFile);
            if (hasIndexFile)
            {
                // A previously saved table exists: load it instead of re-reading the database.
                using (FileStream input = File.OpenRead(this.IndexFile))
                {
                    return Serializer.Deserialize<Dictionary<Int64, PageIndex>>(input);
                }
            }

            // Capture the working directory before the parser runs, as the original code did.
            string currentDir = Directory.GetCurrentDirectory();
            Dictionary<Int64, PageIndex> freshIndices = parser.ReadPageIndices();

            string outputPath = Path.Combine(currentDir, "pageIndices.protobin");
            using (FileStream output = File.Create(outputPath))
            {
                Serializer.Serialize(output, freshIndices);
            }
            System.Console.WriteLine($"Saved the page indices to {outputPath}.");

            return freshIndices;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads every row of the <c>page</c> table from the wiki's core database and
        /// groups the titles by target page: a non-redirect row supplies the primary
        /// title of its own page id, a redirect row adds its title to the secondary
        /// titles of the page it redirects to.
        /// </summary>
        /// <returns>The page indices keyed by page id.</returns>
        public Dictionary<Int64, PageIndex> ReadPageIndices()
        {
            Dictionary<Int64, PageIndex> result = new Dictionary<Int64, PageIndex>();

            string dbPath = Path.Combine(this.inputFolder, $"{this.wikiName}{XowaParser.CoreSuffix}");
            string query  = "SELECT page_id, page_title, page_is_redirect, page_redirect_id FROM page";

            System.Console.WriteLine("Reading page indices...");
            XowaParser.ExecuteSql(dbPath, query, (row) =>
            {
                Int64 rowPageId   = (Int64)row["page_id"];
                string rowTitle   = (string)row["page_title"];
                bool rowRedirects = (Int64)row["page_is_redirect"] == 1;
                Int64 rowTargetId = (Int64)row["page_redirect_id"];

                // A redirect contributes to the entry of its target; a regular page to its own entry.
                Int64 key = rowRedirects ? rowTargetId : rowPageId;

                PageIndex existing;
                bool alreadyKnown = result.TryGetValue(key, out existing);

                if (alreadyKnown && rowRedirects)
                {
                    existing.SecondaryTitles.Add(rowTitle);
                }
                else if (alreadyKnown)
                {
                    // An earlier redirect created the entry; fill in the real title now.
                    existing.PrimaryTitle = rowTitle;
                }
                else
                {
                    // First time this page id is seen. A redirect seen before its target
                    // gets a placeholder primary title until the target row arrives.
                    List<string> secondary = new List<string>();
                    if (rowRedirects)
                    {
                        secondary.Add(rowTitle);
                    }

                    result.Add(key, new PageIndex
                    {
                        Id              = key,
                        PrimaryTitle    = rowRedirects ? "<Unset>" : rowTitle,
                        SecondaryTitles = secondary
                    });
                }
            });
            System.Console.WriteLine($"Read {result.Count} unique page indices.");

            return result;
        }
Exemplo n.º 3
0
        /// <summary>
        /// Reads all rows of the <c>html</c> table in <paramref name="filename"/>,
        /// GZip-decodes each body into text and attaches the titles found in
        /// <paramref name="pageIndices"/>. Pages without an index entry are collected
        /// separately as invalid pages with the title "&lt;Unknown&gt;".
        /// </summary>
        /// <param name="filename">Path of the HTML database file to read.</param>
        /// <param name="pageIndices">Page indices keyed by page id.</param>
        /// <returns>The valid and invalid pages read from the file.</returns>
        public WikiPageList ReadHtmlDbFile(string filename, Dictionary<Int64, PageIndex> pageIndices)
        {
            WikiPageList result = new WikiPageList();
            string query = "SELECT page_id, body FROM html";

            System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");
            XowaParser.ExecuteSql(filename, query, (row) =>
            {
                Int64 rowPageId    = (Int64)row["page_id"];
                byte[] gzippedBody = (byte[])row["body"];

                WikiPage current = new WikiPage();
                current.Id = rowPageId;

                // The body column holds GZip'd data; inflate it into a string.
                using (MemoryStream raw = new MemoryStream(gzippedBody))
                using (GZipInputStream inflater = new GZipInputStream(raw))
                using (StreamReader text = new StreamReader(inflater))
                {
                    current.Content = text.ReadToEnd();
                }

                // Attach titles and file the page under valid or invalid.
                PageIndex match = null;
                bool indexed = pageIndices.TryGetValue(rowPageId, out match);
                if (!indexed)
                {
                    current.PrimaryTitle    = "<Unknown>";
                    current.SecondaryTitles = new List<string>();
                    result.InvalidPages.Add(current);
                }
                else
                {
                    current.PrimaryTitle    = match.PrimaryTitle;
                    current.SecondaryTitles = match.SecondaryTitles;
                    result.ValidPages.Add(current);
                }
            });

            System.Console.WriteLine($"Read {result.ValidPages.Count} total valid pages and {result.InvalidPages.Count} invalid pages.");
            return result;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Reads all rows of the <c>html</c> table in <paramref name="filename"/>,
        /// keeping each body as the raw bytes read from the database, and attaches the
        /// titles found in <paramref name="pageIndices"/>. Pages without an index entry
        /// are collected separately as invalid pages with the title "&lt;Unknown&gt;".
        /// </summary>
        /// <remarks>TODO: Deduplicate.</remarks>
        /// <param name="filename">Path of the HTML database file to read.</param>
        /// <param name="pageIndices">Page indices keyed by page id.</param>
        /// <returns>The valid and invalid pages read from the file.</returns>
        public CompressedWikiPageList ReadCompressedHtmlDbFile(string filename, Dictionary<Int64, PageIndex> pageIndices)
        {
            CompressedWikiPageList result = new CompressedWikiPageList();
            string query = "SELECT page_id, body FROM html";

            System.Console.WriteLine($"Reading HTML files from {Path.GetFileName(filename)}...");
            XowaParser.ExecuteSql(filename, query, (row) =>
            {
                Int64 rowPageId = (Int64)row["page_id"];
                byte[] rawBody  = (byte[])row["body"];

                // The body is stored untouched; no decoding happens here.
                CompressedWikiPage current = new CompressedWikiPage();
                current.Id      = rowPageId;
                current.Content = rawBody;

                // Attach titles and file the page under valid or invalid.
                PageIndex match = null;
                bool indexed = pageIndices.TryGetValue(rowPageId, out match);
                if (!indexed)
                {
                    current.PrimaryTitle    = "<Unknown>";
                    current.SecondaryTitles = new List<string>();
                    result.InvalidPages.Add(current);
                }
                else
                {
                    current.PrimaryTitle    = match.PrimaryTitle;
                    current.SecondaryTitles = match.SecondaryTitles;
                    result.ValidPages.Add(current);
                }
            });

            System.Console.WriteLine($"Read {result.ValidPages.Count} total valid pages and {result.InvalidPages.Count} invalid pages.");
            return result;
        }
Exemplo n.º 5
0
        /// <summary>
        /// Reads every HTML database file found by the parser and writes its pages to
        /// <c>OutputFolder</c> as "html-{n}.protobin", one output file per input file.
        /// Pages that have no page-index entry are written to
        /// "invalid/html-invalid-{n}.protobin" under the output folder.
        /// </summary>
        /// <returns>Always 0.</returns>
        internal int Extract()
        {
            if (this.Compress)
            {
                // NOTE(review): with Compress set, page bodies are passed through as read
                // (see ReadCompressedHtmlDbFile); message and pause are kept as they were.
                System.Console.WriteLine("Skipping compression...");
                Thread.Sleep(1000);
            }

            XowaParser parser = new XowaParser(this.InputFolder);
            Dictionary <Int64, PageIndex> pageIndices = this.GetPageIndices(parser);

            int           fileIdx       = 0;
            List <string> htmlFiles     = parser.GetHtmlDbFileNames();
            string        invalidFolder = Path.Combine(this.OutputFolder, "invalid");

            foreach (string file in htmlFiles)
            {
                // We output a series of files to avoid (a) too many small files (b) too large of a single file.
                string validPath   = Path.Combine(this.OutputFolder, $"html-{fileIdx}.protobin");
                string invalidPath = Path.Combine(invalidFolder, $"html-invalid-{fileIdx}.protobin");

                if (this.Compress)
                {
                    CompressedWikiPageList pages = parser.ReadCompressedHtmlDbFile(file, pageIndices);

                    Extract_WriteProtobufFile(validPath, pages.ValidPages);

                    if (pages.InvalidPages.Any())
                    {
                        // CreateDirectory is a no-op when the folder already exists. The
                        // previous code created the folder but then skipped the write, so
                        // the first batch of invalid pages was silently dropped.
                        Directory.CreateDirectory(invalidFolder);
                        Extract_WriteProtobufFile(invalidPath, pages.InvalidPages);
                    }
                }
                else
                {
                    WikiPageList pages = parser.ReadHtmlDbFile(file, pageIndices);

                    Extract_WriteProtobufFile(validPath, pages.ValidPages);

                    if (pages.InvalidPages.Any())
                    {
                        // Same fix as above: ensure the folder exists, then always write.
                        Directory.CreateDirectory(invalidFolder);
                        Extract_WriteProtobufFile(invalidPath, pages.InvalidPages);
                    }
                }

                ++fileIdx;
            }

            return(0);
        }

        /// <summary>
        /// Serializes <paramref name="payload"/> to a newly created file at <paramref name="path"/>.
        /// </summary>
        private static void Extract_WriteProtobufFile <T>(string path, T payload)
        {
            using (FileStream stream = File.Create(path))
            {
                Serializer.Serialize(stream, payload);
            }
        }