Example #1
0
        // Reads the sample XML dump through WikipediaReader using the default filter and a
        // block size of 1, then checks that exactly two corpus blocks come back, each holding
        // a single document whose metadata entry matches the document itself.
        public void WikipediaReaderTest()
        {
            var dumpSource = GetXmlDump();
            using var dumpReader = new WikiDumpXmlReader(dumpSource);

            var reader = new WikipediaReader(dumpReader, WikipediaReader.DefaultFilter, blockSize: 1);
            var blocks = reader.Read().ToArray();

            Assert.Equal(2, blocks.Length);

            // First block: expected to carry only the "Simple page" document.
            var firstBlock = blocks[0];
            Assert.Equal(1, firstBlock.Documents.Count);
            Assert.Equal(1, firstBlock.Metadata.Count);

            var firstDoc = firstBlock.Documents[0];
            Assert.Equal("Simple page", firstDoc.Metadata.Title);
            Assert.Equal("=Simple Page=\nSome text", firstDoc.Data);

            // The block-level metadata is looked up by the document's own id.
            Assert.Equal("Simple page", firstBlock.Metadata[firstDoc.Metadata.Id].Title);
            Assert.Equal(firstDoc.Metadata.Id, firstBlock.Metadata[firstDoc.Metadata.Id].Id);

            // Second block: expected to carry only the "Another page" document.
            var secondBlock = blocks[1];
            Assert.Equal(1, secondBlock.Documents.Count);
            Assert.Equal(1, secondBlock.Metadata.Count);

            var secondDoc = secondBlock.Documents[0];
            Assert.Equal("Another page", secondDoc.Metadata.Title);
            Assert.Equal("Hello world", secondDoc.Data);

            Assert.Equal("Another page", secondBlock.Metadata[secondDoc.Metadata.Id].Title);
            Assert.Equal(secondDoc.Metadata.Id, secondBlock.Metadata[secondDoc.Metadata.Id].Id);
        }
Example #2
0
        // Reads every page from the sample XML dump with WikiDumpXmlReader and checks the
        // classification flags, title and text of the pages at indices 0, 1 and 3
        // (a redirect, a content page and a special page respectively).
        public void WikiDumpXmlReaderTest()
        {
            var dumpSource = GetXmlDump();
            using var dumpReader = new WikiDumpXmlReader(dumpSource);

            var pages = dumpReader.ReadPages().ToArray();

            // Index 0: redirect pointing at "Simple page".
            var redirectPage = pages[0];
            Assert.True(redirectPage.IsRedirect);
            Assert.False(redirectPage.IsSpecial);
            Assert.False(redirectPage.IsContent);
            Assert.Equal("RedirectPage", redirectPage.Title);
            Assert.Equal("Simple page", redirectPage.RedirectTitle);
            Assert.StartsWith("#REDIRECT [[Simple page]]", redirectPage.Text);

            // Index 1: ordinary content page.
            var contentPage = pages[1];
            Assert.False(contentPage.IsRedirect);
            Assert.False(contentPage.IsSpecial);
            Assert.True(contentPage.IsContent);
            Assert.Equal("Simple page", contentPage.Title);
            Assert.Equal("=Simple Page=\nSome text", contentPage.Text);

            // Index 3: special (category) page; index 2 is intentionally not inspected here.
            var specialPage = pages[3];
            Assert.False(specialPage.IsRedirect);
            Assert.True(specialPage.IsSpecial);
            Assert.False(specialPage.IsContent);
            Assert.Equal("Category:about", specialPage.Title);
            Assert.Equal("This is wikipedia", specialPage.Text);
        }
        /// <summary>
        /// Reads the Wikipedia XML dump at <c>wikiDumpFilePath</c> through a
        /// <c>WikipediaReader</c> (default filter, <c>BlockSize</c>, <c>CorpusSize</c>) and
        /// writes the resulting corpus to <c>wikiPath</c> using a <c>CorpusZipWriter</c>.
        /// </summary>
        static void TransformWikiDump()
        {
            string outputPath = wikiPath;
            PrepareOutputDirectory(outputPath);

            using var dumpReader = new WikiDumpXmlReader(wikiDumpFilePath);

            // NOTE(review): the narrowing cast is kept exactly as before; if BlockSize is a
            // non-constant wider than ushort it would truncate silently — confirm its range.
            ushort blockSize = (ushort)BlockSize;

            ICorpusReader<string> corpusReader = new WikipediaReader(
                dumpReader,
                WikipediaReader.DefaultFilter,
                blockSize,
                CorpusSize);
            ICorpusWriter<string> corpusWriter = new CorpusZipWriter<string>(outputPath, stringDataSerializer);

            var corpus = corpusReader.Read();
            corpusWriter.Write(corpus);
        }