/// <summary>
/// Runs the document-hashing pipeline over the tokenized Wikipedia corpus:
/// reads token blocks from <c>tokenizedPath</c>, hashes each document, and
/// writes the resulting <c>int[]</c> hashes to <c>hashedPath</c>.
/// </summary>
static void HashWikipedia()
{
    var destination = hashedPath;
    PrepareOutputDirectory(destination);

    // Source: tokenized corpus; sink: hashed corpus at the prepared destination.
    var source = new CorpusZipReader<IEnumerable<string>>(tokenizedPath, tokenizedDataSerializer);
    var sink = new CorpusZipWriter<int[]>(destination, hashedDataSerializer);

    new DocumentHasher().Transform(source, sink);
}
/// <summary>
/// Verifies that hashing the two-document test corpus produces fixed-length
/// (7-element) hashes that differ overall, and that the two hashes diverge
/// only at indices 1 and 5.
/// </summary>
public void HashDocumentTest()
{
    // Arrange
    var corpus = CreateCorpus();
    var hasher = new DocumentHasher();

    // Act
    var documents = hasher.Transform(corpus).First().Documents;
    var first = documents[0].Data;
    var second = documents[1].Data;

    // Assert: both hashes have the expected length but differ in content.
    Assert.Equal(7, first.Length);
    Assert.Equal(7, second.Length);
    Assert.NotEqual(first, second);

    // Copying over positions 1 and 5 makes the hashes identical, proving
    // those are the only positions where the two documents differ.
    first[1] = second[1];
    first[5] = second[5];
    Assert.Equal(first, second);
}
/// <summary>
/// Verifies that <see cref="DocumentHasher.Transform"/> driven by a mocked
/// reader/writer pair issues exactly one write, and that the first block of
/// that write contains both documents of the test corpus.
/// </summary>
public void HashDocumentCorpusTest()
{
    // Arrange: a reader mock that yields the test corpus, and a writer mock
    // that captures every batch handed to Write.
    var corpus = CreateCorpus();
    var captured = new List<IEnumerable<Block<int[]>>>();

    var readerMock = new Mock<ICorpusReader<Tokens>>();
    readerMock.Setup(r => r.Read()).Returns(corpus);

    var writerMock = new Mock<ICorpusWriter<int[]>>();
    writerMock
        .Setup(w => w.Write(It.IsAny<IEnumerable<HashedBlock>>()))
        .Callback((IEnumerable<HashedBlock> blocks) => captured.Add(blocks));

    // Act
    var hasher = new DocumentHasher();
    hasher.Transform(readerMock.Object, writerMock.Object);

    // Assert: a single write occurred, and its first block holds two documents.
    Assert.Single(captured);
    Assert.Equal(2, captured[0].First().Documents.Count);
}