/// <summary>
/// Writes <paramref name="corpus"/> into a fresh in-memory <c>MockFileSystem</c>
/// using a <c>CorpusZipWriter</c> rooted at <c>path</c>.
/// </summary>
/// <param name="corpus">Blocks of string documents handed to the writer.</param>
/// <returns>The mock file system that received the written corpus.</returns>
private static MockFileSystem SerializeCorpus(IList<Block<string>> corpus)
        {
            var mockFs = new MockFileSystem();

            // The target directory is created up front, before the writer is constructed.
            mockFs.Directory.CreateDirectory(path);

            var zipWriter = new CorpusZipWriter<string>(
                path,
                new StringDocumentDataSerializer(),
                mockFs);
            zipWriter.Write(corpus);

            return mockFs;
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the corpus stored at <c>wikiPath</c>, passes each document through
        /// <c>WikitextCleaner.Clean</c>, and writes the result to <c>cleanedPath</c>.
        /// </summary>
        static void CleanWikitext()
        {
            var destination = cleanedPath;
            PrepareOutputDirectory(destination);

            // Source and sink share the same char-based serializer.
            var source = new CorpusZipReader<IList<char>>(wikiPath, charDataSerializer);
            var sink = new CorpusZipWriter<IList<char>>(destination, charDataSerializer);

            new CorpusTransformer<IList<char>, IList<char>>(WikitextCleaner.Clean)
                .Transform(source, sink);
        }
Exemplo n.º 3
0
        /// <summary>
        /// Reads the corpus stored at <c>cleanedPath</c>, tokenizes each document with
        /// <c>StateMachineTokenizer.Tokenize</c> (with <c>lowerCase: true</c>), and writes
        /// the result to <c>tokenizedPath</c>.
        /// </summary>
        static void TokenizeWikipedia()
        {
            var destination = tokenizedPath;
            PrepareOutputDirectory(destination);

            // Source and sink share the same char-based serializer.
            var source = new CorpusZipReader<IList<char>>(cleanedPath, charDataSerializer);
            var sink = new CorpusZipWriter<IList<char>>(destination, charDataSerializer);

            var transformer = new CorpusTransformer<IList<char>, IList<char>>(
                text => StateMachineTokenizer.Tokenize(text, lowerCase: true));
            transformer.Transform(source, sink);
        }
Exemplo n.º 4
0
        /// <summary>
        /// Reads the corpus stored at <c>tokenizedPath</c> and runs it through a
        /// <c>DocumentHasher</c>, writing <c>int[]</c> documents to <c>hashedPath</c>.
        /// </summary>
        static void HashWikipedia()
        {
            var destination = hashedPath;
            PrepareOutputDirectory(destination);

            // Input documents are string sequences; output documents are int arrays.
            var source = new CorpusZipReader<IEnumerable<string>>(tokenizedPath, tokenizedDataSerializer);
            var sink = new CorpusZipWriter<int[]>(destination, hashedDataSerializer);

            new DocumentHasher().Transform(source, sink);
        }
Exemplo n.º 5
0
        /// <summary>
        /// Reads the dump at <c>wikiDumpFilePath</c> through a <c>WikipediaReader</c>
        /// and writes everything it yields to <c>wikiPath</c> as a zipped string corpus.
        /// </summary>
        static void TransformWikiDump()
        {
            string destination = wikiPath;
            PrepareOutputDirectory(destination);

            // The XML reader stays open until the write below has completed.
            using (var dumpXml = new WikiDumpXmlReader(wikiDumpFilePath))
            {
                ICorpusReader<string> source = new WikipediaReader(
                    dumpXml,
                    WikipediaReader.DefaultFilter,
                    (ushort)BlockSize,
                    CorpusSize);
                ICorpusWriter<string> sink = new CorpusZipWriter<string>(destination, stringDataSerializer);

                var blocks = source.Read();
                sink.Write(blocks);
            }
        }