/// <summary>
/// Writes <paramref name="corpus"/> through a <c>CorpusZipWriter&lt;string&gt;</c> that targets
/// <c>path</c> on a newly created <see cref="MockFileSystem"/>, and hands that file system back.
/// </summary>
/// <param name="corpus">Blocks passed unchanged to the writer's <c>Write</c> call.</param>
/// <returns>The <see cref="MockFileSystem"/> instance the writer was constructed with.</returns>
private static MockFileSystem SerializeCorpus(IList<Block<string>> corpus)
{
    var mockFs = new MockFileSystem();

    // The target directory is created on the mock file system before the writer is constructed.
    mockFs.Directory.CreateDirectory(path);

    new CorpusZipWriter<string>(path, new StringDocumentDataSerializer(), mockFs).Write(corpus);

    return mockFs;
}
/// <summary>
/// Pipes the corpus at <c>wikiPath</c> through a <c>CorpusTransformer</c> built from
/// <c>WikitextCleaner.Clean</c> and into a writer targeting <c>cleanedPath</c>.
/// </summary>
static void CleanWikitext()
{
    var destination = cleanedPath;

    // The output directory is prepared before either the reader or the writer is created.
    PrepareOutputDirectory(destination);

    var source = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);
    var sink = new CorpusZipWriter<IList<char>>(destination, charDataSerializer);

    new CorpusTransformer<IList<char>, IList<char>>(WikitextCleaner.Clean).Transform(source, sink);
}
/// <summary>
/// Pipes the corpus at <c>cleanedPath</c> through a <c>CorpusTransformer</c> that calls
/// <c>StateMachineTokenizer.Tokenize</c> with <c>lowerCase: true</c>, and into a writer
/// targeting <c>tokenizedPath</c>.
/// </summary>
static void TokenizeWikipedia()
{
    var destination = tokenizedPath;

    // The output directory is prepared before either the reader or the writer is created.
    PrepareOutputDirectory(destination);

    var source = new CorpusZipReader<IList<char>>(cleanedPath, charDataSerializer);
    var sink = new CorpusZipWriter<IList<char>>(destination, charDataSerializer);

    new CorpusTransformer<IList<char>, IList<char>>(
            text => StateMachineTokenizer.Tokenize(text, lowerCase: true))
        .Transform(source, sink);
}
/// <summary>
/// Passes a reader over <c>tokenizedPath</c> and an <c>int[]</c> writer targeting
/// <c>hashedPath</c> to <c>DocumentHasher.Transform</c>.
/// </summary>
static void HashWikipedia()
{
    var destination = hashedPath;

    // The output directory is prepared before either the reader or the writer is created.
    PrepareOutputDirectory(destination);

    var source = new CorpusZipReader<IEnumerable<string>>(tokenizedPath, tokenizedDataSerializer);
    var sink = new CorpusZipWriter<int[]>(destination, hashedDataSerializer);

    new DocumentHasher().Transform(source, sink);
}
/// <summary>
/// Reads the dump at <c>wikiDumpFilePath</c> through a <c>WikipediaReader</c> (configured with
/// <c>WikipediaReader.DefaultFilter</c>, <c>BlockSize</c> and <c>CorpusSize</c>) and writes what
/// it returns to a <c>CorpusZipWriter&lt;string&gt;</c> targeting <c>wikiPath</c>.
/// </summary>
static void TransformWikiDump()
{
    string targetPath = wikiPath;
    PrepareOutputDirectory(targetPath);

    // The XML reader stays open until the write has finished, then is disposed.
    using (var dumpReader = new WikiDumpXmlReader(wikiDumpFilePath))
    {
        ICorpusReader<string> source = new WikipediaReader(
            dumpReader,
            WikipediaReader.DefaultFilter,
            (ushort)BlockSize,
            CorpusSize);
        ICorpusWriter<string> sink = new CorpusZipWriter<string>(targetPath, stringDataSerializer);

        var blocks = source.Read();
        sink.Write(blocks);
    }
}