示例#1
0
        /// <summary>
        /// Connects a zip-backed reader over <c>tokenizedPath</c> and a zip-backed writer
        /// over <c>hashedPath</c>, then hands both to <c>DocumentHasher.Transform</c>.
        /// </summary>
        static void HashWikipedia()
        {
            // Capture the destination once so the directory that gets prepared is
            // the same one the writer is pointed at.
            var destination = hashedPath;
            PrepareOutputDirectory(destination);

            var tokenSource = new CorpusZipReader<IEnumerable<string>>(tokenizedPath, tokenizedDataSerializer);
            var hashSink = new CorpusZipWriter<int[]>(destination, hashedDataSerializer);

            new DocumentHasher().Transform(tokenSource, hashSink);
        }
示例#2
0
        /// <summary>
        /// Hashes the sample corpus and inspects the first two documents of the first
        /// block: both carry 7 hash values, they differ as produced, and they compare
        /// equal once positions 1 and 5 of the first are overwritten with the second's.
        /// </summary>
        public void HashDocumentTest()
        {
            var sampleCorpus = CreateCorpus();
            var firstBlock = new DocumentHasher().Transform(sampleCorpus).First();
            var documents = firstBlock.Documents;

            var left = documents[0];
            var right = documents[1];

            Assert.Equal(7, left.Data.Length);
            Assert.Equal(7, right.Data.Length);
            Assert.NotEqual(left.Data, right.Data);

            // Copy the two positions where the documents are expected to diverge;
            // after that the hash sequences must match element for element.
            left.Data[1] = right.Data[1];
            left.Data[5] = right.Data[5];

            Assert.Equal(left.Data, right.Data);
        }
示例#3
0
        /// <summary>
        /// Runs <c>DocumentHasher.Transform</c> against a mocked reader and writer and
        /// checks that the writer received exactly one batch whose first block holds
        /// two documents.
        /// </summary>
        public void HashDocumentCorpusTest()
        {
            var sampleCorpus = CreateCorpus();
            var writtenBatches = new List<IEnumerable<Block<int[]>>>();

            // Reader stub: always yields the sample corpus.
            var readerMock = new Mock<ICorpusReader<Tokens>>();
            readerMock.Setup(r => r.Read()).Returns(sampleCorpus);

            // Writer stub: records every batch passed to Write for later inspection.
            var writerMock = new Mock<ICorpusWriter<int[]>>();
            writerMock
                .Setup(w => w.Write(It.IsAny<IEnumerable<HashedBlock>>()))
                .Callback((IEnumerable<HashedBlock> batch) => writtenBatches.Add(batch));

            var hasher = new DocumentHasher();
            hasher.Transform(readerMock.Object, writerMock.Object);

            Assert.Single(writtenBatches);

            var firstBlock = writtenBatches[0].First();
            Assert.Equal(2, firstBlock.Documents.Count);
        }