Exemple #1
0
        /// <summary>
        /// Reads the XML dump returned by <c>GetXmlDump</c> through a
        /// <c>WikipediaReader</c> configured with <c>blockSize: 1</c> and checks
        /// that the result consists of two blocks, each holding exactly one
        /// document whose title, text and metadata entry match the expected pages.
        /// </summary>
        public void WikipediaReaderTest()
        {
            var dump = GetXmlDump();

            // The XML reader stays alive until the end of the test method.
            using var dumpReader = new WikiDumpXmlReader(dump);
            var reader = new WikipediaReader(
                dumpReader,
                WikipediaReader.DefaultFilter,
                blockSize: 1);

            // Materialize everything up front so the blocks can be indexed.
            var blocks = reader.Read().ToArray();
            Assert.Equal(2, blocks.Length);

            // First block: the "Simple page" article.
            var firstBlock = blocks[0];
            Assert.Equal(1, firstBlock.Documents.Count);
            Assert.Equal(1, firstBlock.Metadata.Count);

            var firstDoc = firstBlock.Documents[0];
            Assert.Equal("Simple page", firstDoc.Metadata.Title);
            Assert.Equal("=Simple Page=\nSome text", firstDoc.Data);
            // The block-level metadata must be addressable by the document's id.
            Assert.Equal("Simple page", firstBlock.Metadata[firstDoc.Metadata.Id].Title);
            Assert.Equal(firstDoc.Metadata.Id, firstBlock.Metadata[firstDoc.Metadata.Id].Id);

            // Second block: the "Another page" article.
            var secondBlock = blocks[1];
            Assert.Equal(1, secondBlock.Documents.Count);
            Assert.Equal(1, secondBlock.Metadata.Count);

            var secondDoc = secondBlock.Documents[0];
            Assert.Equal("Another page", secondDoc.Metadata.Title);
            Assert.Equal("Hello world", secondDoc.Data);
            Assert.Equal("Another page", secondBlock.Metadata[secondDoc.Metadata.Id].Title);
            Assert.Equal(secondDoc.Metadata.Id, secondBlock.Metadata[secondDoc.Metadata.Id].Id);
        }
        /// <summary>
        /// Reads the dump at <c>wikiDumpFilePath</c> through a
        /// <c>WikipediaReader</c> and hands the resulting sequence to a
        /// <c>CorpusZipWriter</c> constructed with <c>wikiPath</c>.
        /// </summary>
        static void TransformWikiDump()
        {
            string outputPath = wikiPath;

            // NOTE(review): presumably creates or cleans the target directory -
            // confirm against PrepareOutputDirectory.
            PrepareOutputDirectory(outputPath);

            // The XML reader is disposed only after the writer has finished.
            using (var dumpReader = new WikiDumpXmlReader(wikiDumpFilePath))
            {
                ICorpusReader<string> corpusReader = new WikipediaReader(
                    dumpReader,
                    WikipediaReader.DefaultFilter,
                    (ushort)BlockSize,
                    CorpusSize);
                ICorpusWriter<string> corpusWriter =
                    new CorpusZipWriter<string>(outputPath, stringDataSerializer);

                var blocks = corpusReader.Read();
                corpusWriter.Write(blocks);
            }
        }