        protected DedupNode CanChunkLargeFilesHelper(HashType hashType, int blockSize, int blockCount, string expected)
        {
            var r = new Random(Seed: 0);

            byte[] bytes = new byte[blockSize];
            byte[] tempBytes = new byte[blockSize];

            using (var mgdHasher = new DedupNodeOrChunkHashAlgorithm(new ManagedChunker(hashType.GetChunkerConfiguration())))
            using (var comHasher = (Chunker.IsComChunkerSupported && hashType == HashType.Dedup64K)
                ? new DedupNodeOrChunkHashAlgorithm(new ComChunker(ChunkerConfiguration.SupportedComChunkerConfiguration))
                : null)
            {
                long totalLength = (long)blockSize * blockCount;
                mgdHasher.SetInputLength(totalLength);
                comHasher?.SetInputLength(totalLength);

                FillBufferWithTestContent(seed: r.Next(), bytes);

                for (int i = 0; i < blockCount; i++)
                {
                    Task.WaitAll(
                        Task.Run(() => mgdHasher.TransformBlock(bytes, 0, bytes.Length, null, 0)),
                        Task.Run(() => comHasher?.TransformBlock(bytes, 0, bytes.Length, null, 0)),
                        // Fill the buffer for the next iteration in parallel with the
                        // hashing work to speed up the test.
                        Task.Run(() =>
                        {
                            if (i < blockCount - 1)
                            {
                                FillBufferWithTestContent(seed: r.Next(), tempBytes);
                            }
                        }));

                    swap(ref bytes, ref tempBytes);
                }

                mgdHasher.TransformFinalBlock(new byte[0], 0, 0);
                comHasher?.TransformFinalBlock(new byte[0], 0, 0);

                var node = mgdHasher.GetNode();
                Assert.Equal<long>(
                    (long)blockSize * blockCount,
                    node.EnumerateChunkLeafsInOrder().Sum(c => (long)c.TransitiveContentBytes));

                Assert.Equal<string>(expected, node.Hash.ToHex());
                if (comHasher != null)
                {
                    Assert.Equal<string>(expected, comHasher.GetNode().Hash.ToHex());
                }

                return node;
            }
        }
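A caller drives this helper with a concrete hash type, block geometry, and a precomputed root hash. A minimal sketch follows; the block size, block count, and expected digest are hypothetical placeholders, not values from the real suite:

        protected void CanChunkLargeFilesSketch()
        {
            // Hypothetical geometry: 2 MB blocks, 32 blocks (64 MB total).
            const int blockSize = 2 * 1024 * 1024;
            const int blockCount = 32;

            // Placeholder: a real test pins the known-good root hash here.
            string expected = "...";

            DedupNode node = CanChunkLargeFilesHelper(HashType.Dedup64K, blockSize, blockCount, expected);
            Assert.Equal(DedupNode.NodeType.InnerNode, node.Type);
        }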
Example #2
        public void CanEnumerateChunksInChunk()
        {
            DedupNode rootFromHash;

            using (var hasher = new DedupNodeOrChunkHashAlgorithm())
            {
                hasher.SetInputLength(1);
                hasher.ComputeHash(new byte[1]);
                rootFromHash = hasher.GetNode();
            }

            Assert.Equal(rootFromHash.HashString, rootFromHash.EnumerateChunkLeafsInOrder().Single().HashString);
        }
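For contrast with the single-byte case above, where the root is itself the only chunk leaf, here is a sketch of the multi-chunk case; the input size is a hypothetical choice, merely large enough to force several chunks:

        public void CanEnumerateChunksInNodeSketch()
        {
            byte[] bytes = new byte[16 * 1024 * 1024]; // hypothetical 16 MB input
            new Random(Seed: 0).NextBytes(bytes);

            using (var hasher = new DedupNodeOrChunkHashAlgorithm())
            {
                hasher.SetInputLength(bytes.Length);
                hasher.ComputeHash(bytes);
                DedupNode root = hasher.GetNode();

                // The root is now an inner node; the leaf walk yields each chunk in order.
                Assert.True(root.EnumerateChunkLeafsInOrder().Count() > 1);
            }
        }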
        private void HashOfChunksInNodeMatchesChunkHashAlgorithmInner(int expectedChunkCount, ChunkerConfiguration config, IChunker chunker)
        {
            using (DedupNodeOrChunkHashAlgorithm nodeHasher = new DedupNodeOrChunkHashAlgorithm(chunker))
            using (DedupChunkHashAlgorithm chunkHasher = new DedupChunkHashAlgorithm())
            {
                byte[] bytes = new byte[expectedChunkCount * config.AvgChunkSize];

                nodeHasher.SetInputLength(bytes.Length);

                var r = new Random(Seed: 0);
                r.NextBytes(bytes);

                nodeHasher.ComputeHash(bytes, 0, bytes.Length);
                var node = nodeHasher.GetNode();
                Assert.NotNull(node.Height);
                if (expectedChunkCount >= 2 * DedupNode.MaxDirectChildrenPerNode)
                {
                    Assert.Equal((uint)2, node.Height.Value);
                }

                ulong offset = 0;
                int chunkCount = 0;
                foreach (var chunkInNode in node.EnumerateChunkLeafsInOrder())
                {
                    byte[] chunkHash = chunkHasher.ComputeHash(bytes, (int)offset, (int)chunkInNode.TransitiveContentBytes);
                    Assert.Equal(chunkHash.ToHex(), chunkInNode.Hash.ToHex());
                    offset += chunkInNode.TransitiveContentBytes;
                    chunkCount += 1;
                }

                Assert.Equal(offset, node.TransitiveContentBytes);

                double ratio = (1.0 * expectedChunkCount) / chunkCount;
                Assert.True(Math.Abs(ratio - 1.0) < 0.3); // chunk count within 30% of expected
            }
        }
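A hedged sketch of how this inner helper might be driven: the chunk count is chosen to trip the two-level height assertion above, and reusing SupportedComChunkerConfiguration as the configuration is an assumption for illustration:

        private void HashOfChunksInNodeMatchesChunkHashAlgorithmSketch()
        {
            ChunkerConfiguration config = ChunkerConfiguration.SupportedComChunkerConfiguration;
            HashOfChunksInNodeMatchesChunkHashAlgorithmInner(
                2 * DedupNode.MaxDirectChildrenPerNode, // forces a tree of height 2
                config,
                new ManagedChunker(config));
        }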
        private void ChunksAndNodesInCommonInSimilarFilesInternal(HashType hashType)
        {
            using var hasher = new DedupNodeOrChunkHashAlgorithm(new ManagedChunker(hashType.GetChunkerConfiguration()));
            byte[] bytes = new byte[50 * 1024 * 1024];

            int offsetForSecondFile = 200 * 1024;

            var r = new Random(Seed: 0);

            r.NextBytes(bytes);

            hasher.SetInputLength(bytes.Length);
            hasher.SetInputLength(bytes.Length);
            byte[] hash1 = hasher.ComputeHash(bytes, 0, bytes.Length);
            var node1 = hasher.GetNode();
            HashSet<string> chunks1 = node1.EnumerateChunkLeafsInOrder().Select(c => c.Hash.ToHex()).ToHashSet();
            HashSet<string> nodes1 = node1.EnumerateInnerNodesDepthFirst().Select(c => c.Hash.ToHex()).ToHashSet();

            hasher.SetInputLength(bytes.Length);
            byte[] hash2 = hasher.ComputeHash(bytes, offsetForSecondFile, bytes.Length - offsetForSecondFile);
            var node2 = hasher.GetNode();
            HashSet<string> chunks2 = node2.EnumerateChunkLeafsInOrder().Select(c => c.Hash.ToHex()).ToHashSet();
            HashSet<string> nodes2 = node2.EnumerateInnerNodesDepthFirst().Select(c => c.Hash.ToHex()).ToHashSet();

            Assert.NotEqual(hash1, hash2, ByteArrayComparer.Instance);

            var commonChunks = new HashSet<string>(chunks1);

            commonChunks.IntersectWith(chunks2);
            Assert.Subset(chunks1, commonChunks);
            Assert.Subset(chunks2, commonChunks);
            Assert.InRange(commonChunks.Count, chunks1.Count - (chunks1.Count / 10), chunks1.Count);
            Assert.InRange(commonChunks.Count, chunks2.Count - (chunks2.Count / 10), chunks2.Count);

            var commonNodes = new HashSet<string>(nodes1);

            commonNodes.IntersectWith(nodes2);
            Assert.Subset(nodes1, commonNodes);
            Assert.Subset(nodes2, commonNodes);

            int nodeQueries = 0;
            int chunkQueries = 0;

            node2.VisitPreorder(n =>
            {
                switch (n.Type)
                {
                    case DedupNode.NodeType.ChunkLeaf:
                        chunkQueries++;
                        break;

                    case DedupNode.NodeType.InnerNode:
                        nodeQueries++;
                        break;
                }

                return !nodes1.Contains(n.Hash.ToHex());
            });

            Assert.Equal(0, commonNodes.Count);
            Assert.Equal(nodeQueries, nodes2.Count);
            Assert.Equal(chunkQueries, chunks2.Count);
        }
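A typical test entry point simply forwards a concrete hash type to this helper; a minimal sketch, assuming the suite's xUnit conventions:

        [Fact]
        public void ChunksAndNodesInCommonInSimilarFiles()
        {
            ChunksAndNodesInCommonInSimilarFilesInternal(HashType.Dedup64K);
        }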
Example #5
        public void DedupHashFile
        (
            [Required] string[] path,
            [Required] string hashType,
            [DefaultValue(false)] bool chunks,
            [DefaultValue(false)] bool childNodes,
            [DefaultValue(FileSystemConstants.FileIOBufferSize)] int bufferSize,
            [DefaultValue((long)0)] long startOffset
        )
        {
            Initialize();

            _displayChunks = chunks;
            _displayChildNodes = childNodes;

            if (!Enum.TryParse(hashType, out HashType dedupHashType))
            {
                throw new ArgumentException($"HashType couldn't be inferred - {hashType}. Valid HashType is required.");
            }

            var paths = new List<AbsolutePath>();

            foreach (AbsolutePath root in path.Select(p => new AbsolutePath(Path.GetFullPath(p))))
            {
                if (_fileSystem.DirectoryExists(root))
                {
                    paths.AddRange(_fileSystem.EnumerateFiles(root, EnumerateOptions.Recurse).Select(fileInfo => fileInfo.FullPath));
                }
                else if (_fileSystem.FileExists(root))
                {
                    paths.Add(root);
                }
                else
                {
                    throw new ArgumentException("given path is not an existing file or directory");
                }
            }

            var buffer = new byte[bufferSize];

            using (var contentHasher = new DedupNodeOrChunkHashAlgorithm(new ManagedChunker(dedupHashType.GetChunkerConfiguration())))
            {
                foreach (var p in paths)
                {
                    contentHasher.Initialize();
                    TaskSafetyHelpers.SyncResultOnThreadPool(async () =>
                    {
                        using (Stream fs = await _fileSystem.OpenReadOnlySafeAsync(p, FileShare.Read | FileShare.Delete))
                        {
                            fs.Position = startOffset;
                            int bytesRead;
                            while ((bytesRead = await fs.ReadAsync(buffer, 0, buffer.Length)) > 0)
                            {
                                contentHasher.TransformBlock(buffer, 0, bytesRead, null, 0);
                            }
                            contentHasher.TransformFinalBlock(new byte[0], 0, 0);
                            DedupNode root = contentHasher.GetNode();
                            ulong offset   = 0;
                            LogNode(true, string.Empty, root, p, ref offset);
                        }

                        return 0;
                    });
                }
            }

            _logger.Always("Totals:");
            _logger.Always($"Bytes: Unique={_uniqueBytes:N0} Total={_totalBytes:N0}");
            _logger.Always($"Chunks: Unique={_allChunks.Count:N0} Total={_totalChunks:N0}");
            _logger.Always($"Nodes: Unique={_allNodes.Count:N0} Total={_totalNodes:N0}");
        }
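Stripped of the argument parsing and logging, the hashing loop above reduces to the standard streaming HashAlgorithm pattern. A minimal sketch, assuming plain System.IO file access in place of the tool's _fileSystem abstraction:

        private static DedupNode HashFileSketch(string filePath, HashType hashType)
        {
            var buffer = new byte[64 * 1024]; // hypothetical buffer size

            using (var hasher = new DedupNodeOrChunkHashAlgorithm(new ManagedChunker(hashType.GetChunkerConfiguration())))
            using (var fs = System.IO.File.OpenRead(filePath))
            {
                hasher.SetInputLength(fs.Length);

                int bytesRead;
                while ((bytesRead = fs.Read(buffer, 0, buffer.Length)) > 0)
                {
                    hasher.TransformBlock(buffer, 0, bytesRead, null, 0);
                }

                hasher.TransformFinalBlock(new byte[0], 0, 0);
                return hasher.GetNode();
            }
        }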
Example #6
        private void ChunksEnumeratedAsFileIsRead(Func<IChunker> chunkerFactory, HashType hashType)
        {
            var chunks = new List<ChunkInfo>();

            byte[] bytes;

            using (var chunker = chunkerFactory())
            {
                bytes = new byte[4 * chunker.Configuration.MinPushBufferSize];

                var r = new Random(Seed: 0);
                r.NextBytes(bytes);

                using (var session = chunker.BeginChunking(chunk =>
                {
                    chunks.Add(chunk);
                }))
                {
                    int pushSize = 2 * chunker.Configuration.MinPushBufferSize;
                    int lastChunkCount = 0;
                    for (int i = 0; i < bytes.Length; i += pushSize)
                    {
                        session.PushBuffer(bytes, i, Math.Min(pushSize, bytes.Length - i));
                        Assert.True(chunks.Count > lastChunkCount);
                        lastChunkCount = chunks.Count;
                    }
                }
            }

            string[] expectedChunkHashes = chunks.Select(c => c.Hash.ToHex()).ToArray();

            DedupNode rootFromHash;

            string[] actualChunkHashes;

            using (var hasher = new DedupNodeOrChunkHashAlgorithm(Chunker.Create(hashType.GetChunkerConfiguration())))
            {
                hasher.SetInputLength(bytes.Length);
                hasher.ComputeHash(bytes);
                rootFromHash = hasher.GetNode();
                actualChunkHashes = rootFromHash.EnumerateChunkLeafsInOrder().Select(c => c.Hash.ToHex()).ToArray();
                Assert.Equal(expectedChunkHashes, actualChunkHashes);
            }

            var seenNodes = new HashSet<byte[]>(chunks.Select(c => c.Hash), ByteArrayComparer.Instance);

            DedupNode? root = null;

            foreach (var node in PackedDedupNodeTree.EnumerateTree(chunks)
                     .Where(n => n.Type != DedupNode.NodeType.ChunkLeaf))
            {
                foreach (var child in node.ChildNodes)
                {
                    Assert.True(seenNodes.Contains(child.Hash));
                }

                Assert.True(seenNodes.Add(node.Hash));
                root = node;
            }

            Assert.True(root.HasValue);

            // ReSharper disable once PossibleInvalidOperationException
            Assert.Equal(rootFromHash, root.Value);
            actualChunkHashes = root.Value.EnumerateChunkLeafsInOrder().Select(c => c.Hash.ToHex()).ToArray();
            Assert.Equal(expectedChunkHashes, actualChunkHashes);
        }
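Since the loop above keeps overwriting root and only the last non-leaf node survives, the same result can be stated more directly. A sketch, assuming (as the loop does) that PackedDedupNodeTree.EnumerateTree yields children before parents and the root last:

        private static DedupNode RootFromChunksSketch(List<ChunkInfo> chunks)
        {
            // The enumeration is bottom-up, so the last non-leaf node is the root.
            return PackedDedupNodeTree.EnumerateTree(chunks)
                .Last(n => n.Type != DedupNode.NodeType.ChunkLeaf);
        }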