// Feeds the dump returned by GetXmlDump() through a WikipediaReader configured with
// blockSize: 1 and verifies that Read() yields two entries, each holding exactly one
// document whose title, data and metadata lookup match the expected pages.
public void WikipediaReaderTest()
{
    using var dumpReader = new WikiDumpXmlReader(GetXmlDump());
    var reader = new WikipediaReader(
        dumpReader,
        WikipediaReader.DefaultFilter,
        blockSize: 1);

    var blocks = reader.Read().ToArray();

    Assert.Equal(2, blocks.Length);

    // First entry: the "Simple page" article.
    var firstBlock = blocks[0];
    Assert.Equal(1, firstBlock.Documents.Count);
    Assert.Equal(1, firstBlock.Metadata.Count);

    var simplePage = firstBlock.Documents[0];
    Assert.Equal("Simple page", simplePage.Metadata.Title);
    Assert.Equal("=Simple Page=\nSome text", simplePage.Data);

    // The entry-level metadata must be addressable by the document's id and agree with it.
    Assert.Equal("Simple page", firstBlock.Metadata[simplePage.Metadata.Id].Title);
    Assert.Equal(simplePage.Metadata.Id, firstBlock.Metadata[simplePage.Metadata.Id].Id);

    // Second entry: the "Another page" article.
    var secondBlock = blocks[1];
    Assert.Equal(1, secondBlock.Documents.Count);
    Assert.Equal(1, secondBlock.Metadata.Count);

    var anotherPage = secondBlock.Documents[0];
    Assert.Equal("Another page", anotherPage.Metadata.Title);
    Assert.Equal("Hello world", anotherPage.Data);

    Assert.Equal("Another page", secondBlock.Metadata[anotherPage.Metadata.Id].Title);
    Assert.Equal(anotherPage.Metadata.Id, secondBlock.Metadata[anotherPage.Metadata.Id].Id);
}
// Prepares the output directory at wikiPath, reads the dump at wikiDumpFilePath through a
// WikipediaReader (default filter, BlockSize narrowed to ushort, CorpusSize), and hands the
// result of Read() to a CorpusZipWriter targeting that same directory.
static void TransformWikiDump()
{
    string outputPath = wikiPath;
    PrepareOutputDirectory(outputPath);

    using var dumpReader = new WikiDumpXmlReader(wikiDumpFilePath);

    // Locals stay typed as the interfaces so member lookup matches the original call sites.
    ICorpusReader<string> corpusReader = new WikipediaReader(
        dumpReader,
        WikipediaReader.DefaultFilter,
        (ushort)BlockSize,
        CorpusSize);
    ICorpusWriter<string> corpusWriter =
        new CorpusZipWriter<string>(outputPath, stringDataSerializer);

    var corpus = corpusReader.Read();
    corpusWriter.Write(corpus);
}