Пример #1
0
        /// <summary>
        /// Cuts <paramref name="stream"/> into consecutive partitions at the positions consumed from
        /// <paramref name="positions"/>, and publishes one hash value and one
        /// <see cref="FileChunkInfo"/> per partition.
        /// </summary>
        /// <param name="stream">Source stream; its <c>Length</c> is used to size the trailing partition.</param>
        /// <param name="positions">Cut positions; each one is the inclusive end of a partition.</param>
        /// <param name="partitionHashValues">Receives the value returned by <c>CalcStreamPortion</c> for each partition.</param>
        /// <param name="fileChunkInfo">Receives the start offset and length of each partition.</param>
        /// <remarks>
        /// Both output collections are marked complete before returning.
        /// NOTE(review): <c>CalcStreamPortion</c> presumably reads the requested number of bytes from the
        /// stream's current position - confirm against its definition.
        /// </remarks>
        public void ProcessStream(Stream stream, BlockingCollectionDataChunk<int> positions,
            BlockingCollectionDataChunk<uint> partitionHashValues, BlockingCollectionDataChunk<FileChunkInfo> fileChunkInfo)
        {
            var partitionStart = 0;

            foreach (var chunk in positions.BlockingCollection.GetConsumingEnumerable())
            {
                var index = 0;
                while (index < chunk.DataSize)
                {
                    var cutPos = chunk.Data[index];

                    // The partition spans [partitionStart, cutPos], both ends inclusive.
                    var length = cutPos - partitionStart + 1;
                    partitionHashValues.Add(CalcStreamPortion(stream, length));
                    fileChunkInfo.Add(new FileChunkInfo { Pos = partitionStart, Length = length });

                    // The next partition begins right after this cut.
                    partitionStart = cutPos + 1;
                    ++index;
                }
            }

            // Whatever lies between the last cut and the end of the stream forms the final partition.
            var tailLength = (int)stream.Length - partitionStart;
            if (tailLength != 0)
            {
                partitionHashValues.Add(CalcStreamPortion(stream, tailLength));
                fileChunkInfo.Add(new FileChunkInfo { Pos = partitionStart, Length = tailLength });
            }

            partitionHashValues.CompleteAdding();
            fileChunkInfo.CompleteAdding();
        }
Пример #2
0
        /// <summary>
        /// Reads <paramref name="inputStream"/> in <c>BufferSize</c>-byte blocks and publishes a rolling
        /// Adler-32 hash over a <c>HashBlock</c>-byte window to <paramref name="hashValues"/>.
        /// </summary>
        /// <param name="inputStream">Stream to hash; read sequentially from its current position.</param>
        /// <param name="hashValues">Receives the hash values; marked complete before returning.</param>
        /// <remarks>
        /// Two buffers are alternated so that the rolling window can still reach bytes of the previous
        /// block. Data past the end of the stream is treated as zero bytes.
        /// Each buffer is filled with repeated reads because <see cref="Stream.Read(byte[], int, int)"/>
        /// may return fewer bytes than requested before the end of the stream; treating such a short
        /// read as the final block would zero-pad the middle of the data and misalign the window.
        /// </remarks>
        public void StreamToHashValues(Stream inputStream, BlockingCollectionDataChunk<uint> hashValues)
        {
            var prevBuffer = new byte[BufferSize];
            var buffer = new byte[BufferSize];
            var starting = true;
            var prevHashValue = 0U;

            var currHashEndIdx = -1;
            var hEndIdx = -1;

            while (true)
            {
                // Fill the buffer completely unless the stream ends first.
                var byteRead = 0;
                while (byteRead < BufferSize)
                {
                    var justRead = inputStream.Read(buffer, byteRead, BufferSize - byteRead);
                    if (justRead == 0)
                    {
                        break;
                    }
                    byteRead += justRead;
                }
                if (byteRead == 0)
                {
                    // End of stream.
                    break;
                }

                if (starting)
                {
                    // Seed the rolling hash with a full calculation over the first window.
                    var hv = Adler32Checksum.Calculate(buffer, 0, HashBlock);
                    hashValues.Add(hv);
                    prevHashValue = hv;
                    starting = false;
                    currHashEndIdx = HashBlock - 1;
                }
                if (byteRead < BufferSize)
                {
                    // Zero the stale tail so the window rolls over zeros past the end of the data.
                    Array.Clear(buffer, byteRead, BufferSize - byteRead);
                }
                // Window end index at which the last hash for the data read so far is produced.
                hEndIdx = byteRead + HashBlock - 2;

                while (currHashEndIdx < BufferSize - 1 && currHashEndIdx != hEndIdx)
                {
                    CalcAdlerHashEndIdx(ref currHashEndIdx, prevBuffer, buffer, ref prevHashValue, hashValues);
                }
                // Re-base both indices so they are relative to the next buffer.
                currHashEndIdx -= BufferSize;
                hEndIdx -= BufferSize;
                // Swap 2 buffers
                var temp = buffer;
                buffer = prevBuffer;
                prevBuffer = temp;
            }
            if (currHashEndIdx != hEndIdx)
            {
                // Need an empty final block.
                Array.Clear(buffer, 0, BufferSize);

                while (currHashEndIdx != hEndIdx)
                {
                    CalcAdlerHashEndIdx(ref currHashEndIdx, prevBuffer, buffer, ref prevHashValue, hashValues);
                }
            }
            hashValues.CompleteAdding();
        }
Пример #3
0
 /// <summary>
 /// Scans the hash values consumed from <paramref name="inputList"/> and publishes the global
 /// position of every value that is divisible by 2 * 512.
 /// </summary>
 /// <param name="inputList">Chunks of hash values to scan.</param>
 /// <param name="outputPos">Receives the matching positions; marked complete before returning.</param>
 public void CalcNew(BlockingCollectionDataChunk<uint> inputList, BlockingCollectionDataChunk<int> outputPos)
 {
     // Global position of the first element of the chunk being scanned.
     var chunkStart = 0;

     foreach (var chunk in inputList.BlockingCollection.GetConsumingEnumerable())
     {
         var offset = 0;
         while (offset < chunk.DataSize)
         {
             if (chunk.Data[offset] % (2 * 512) == 0)
             {
                 outputPos.Add(chunkStart + offset);
             }
             ++offset;
         }
         chunkStart += chunk.DataSize;
     }

     outputPos.CompleteAdding();
 }
Пример #4
0
        /// <summary>
        /// Single-threaded reference pipeline: computes rolling hashes of the file, derives cut
        /// positions from them, hashes the resulting partitions and appends the partition hashes
        /// to <paramref name="partitionHash"/>.
        /// </summary>
        /// <param name="filename">Path of the file to process.</param>
        /// <param name="partitionHash">Receives each partition hash, cast from uint to int.</param>
        static void ProcessFileNaive(string filename, List<int> partitionHash)
        {
            var hashes = new List<uint>();
            var cutPositions = new List<int>();
            var chunkInfoSink = new BlockingCollectionDataChunk<FileChunkInfo>();

            // Stage 1: rolling hash over the whole file, held in memory.
            var content = File.ReadAllBytes(filename);
            using (var memory = new MemoryStream(content, 0, content.Length, true, true))
            {
                var hasher = new FileHash(1024);
                hasher.StreamToHashValuesNaive(memory, hashes);
            }

            // Stage 2: cut positions derived from the rolling hashes.
            var maxima = new LocalMaxima(4 * 1024);
            maxima.CalcUsingNaive(hashes, cutPositions);

            // Repackage the positions into the collection type that ProcessStream consumes.
            var cutPositionQueue = new BlockingCollectionDataChunk<int>();
            foreach (var cut in cutPositions)
            {
                cutPositionQueue.Add(cut);
            }
            cutPositionQueue.CompleteAdding();

            // Stage 3: hash every partition of the file.
            var partitionHashQueue = new BlockingCollectionDataChunk<uint>();
            var murmur = new MurmurHash3_x86_32();
            var partitioner = new FileParitionHash(murmur);
            using (var file = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
            {
                partitioner.ProcessStream(file, cutPositionQueue, partitionHashQueue, chunkInfoSink);
            }

            // Copy the partition hashes out to the caller's list.
            foreach (var chunk in partitionHashQueue.BlockingCollection.GetConsumingEnumerable())
            {
                var index = 0;
                while (index < chunk.DataSize)
                {
                    partitionHash.Add((int)chunk.Data[index]);
                    ++index;
                }
            }
        }
Пример #5
0
        /// <summary>
        /// Runs the three pipeline stages (rolling hash, local-maxima detection, partition hashing)
        /// as concurrent tasks connected by blocking collections, and copies the results into the
        /// caller's lists.
        /// </summary>
        /// <param name="filename">Path of the file to process; opened read-only by two of the stages.</param>
        /// <param name="partitionHash">Receives each partition hash, cast from uint to int.</param>
        /// <param name="fci">Receives the chunk info of each partition.</param>
        /// <remarks>
        /// The stage tasks are awaited before returning, so the file streams they own are closed by
        /// the time the caller continues and any stage exception is rethrown inside an
        /// <see cref="AggregateException"/> rather than being left unobserved.
        /// NOTE(review): a stage that fails before completing its output collection will still leave
        /// the drain loops below waiting; fixing that requires changes in the stage methods.
        /// </remarks>
        static void ProcessFile(string filename, List<int> partitionHash, List<FileChunkInfo> fci)
        {
            var rollingHash = new BlockingCollectionDataChunk<uint>();
            var localMaximaPos = new BlockingCollectionDataChunk<int>();
            var ph = new BlockingCollectionDataChunk<uint>();
            var fciBC = new BlockingCollectionDataChunk<FileChunkInfo>();

            var lmWindow = 32 * 1024;

            // Stage 1: file bytes -> rolling hash values.
            var hashTask = Task.Run(() =>
            {
                using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
                {
                    var fh = new FileHash(16);
                    fh.StreamToHashValues(fs, rollingHash);
                }
            });

            // Stage 2: rolling hash values -> cut positions.
            var maximaTask = Task.Run(() =>
            {
                var lm = new LocalMaxima(lmWindow);
                lm.CalcUsingBlockAlgo(rollingHash, localMaximaPos);
            });

            // Stage 3: cut positions -> partition hashes and chunk info.
            var partitionTask = Task.Run(() =>
            {
                var mmh = new MurmurHash3_x86_32();
                var fph = new FileParitionHash(mmh);
                using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
                {
                    fph.ProcessStream(fs, localMaximaPos, ph, fciBC);
                }
            });

            // Drain the partition hashes first, then the chunk info (same order as before).
            foreach (var items in ph.BlockingCollection.GetConsumingEnumerable())
            {
                for (var i = 0; i < items.DataSize; ++i)
                {
                    partitionHash.Add((int)items.Data[i]);
                }
            }

            foreach (var items in fciBC.BlockingCollection.GetConsumingEnumerable())
            {
                for (var i = 0; i < items.DataSize; ++i)
                {
                    fci.Add(items.Data[i]);
                }
            }

            // All outputs are complete here; wait for the stages to finish disposing their streams
            // and surface any exception they raised.
            Task.WaitAll(hashTask, maximaTask, partitionTask);
        }
Пример #6
0
        /// <summary>
        /// Regroups the incoming hash values into fixed blocks of <c>BlockSize</c> elements, runs
        /// <c>ProcessOneBlock</c> on each block, and publishes the resulting positions to
        /// <paramref name="outputPos"/>.
        /// </summary>
        /// <param name="inputList">Chunks of input values; chunk sizes need not match <c>BlockSize</c>.</param>
        /// <param name="outputPos">Receives the detected positions; marked complete before returning.</param>
        /// <remarks>
        /// A trailing partial block is zero-padded to full size before it is processed. The candidate
        /// still alive after the last processed block is emitted at the end.
        /// </remarks>
        public void CalcUsingBlockAlgo(BlockingCollectionDataChunk<uint> inputList, BlockingCollectionDataChunk<int> outputPos)
        {
            // Global position of the first element of the block being assembled.
            var blockStartPos = 0;
            // Number of elements already written into the block being assembled.
            var filled = 0;

            // Candidate carried over from the most recently processed block (-1 means none).
            var pendingIdx = -1;
            var pendingVal = 0U;

            var activeSeq = new List<KeyValuePair<int, uint>>();
            var earlierSeq = new List<KeyValuePair<int, uint>>();
            var activeBlock = new uint[BlockSize];
            var earlierBlock = new uint[BlockSize];

            foreach (var chunk in inputList.BlockingCollection.GetConsumingEnumerable())
            {
                var srcOffset = 0;
                for (var remaining = chunk.DataSize; remaining != 0; )
                {
                    // Move as much of the chunk as still fits into the active block.
                    var take = Math.Min(remaining, BlockSize - filled);
                    Array.Copy(chunk.Data, srcOffset, activeBlock, filled, take);
                    srcOffset += take;
                    remaining -= take;
                    filled += take;

                    if (filled != BlockSize)
                    {
                        continue;
                    }

                    var survivor = ProcessOneBlock(activeBlock, blockStartPos, activeSeq,
                        earlierBlock, pendingIdx, pendingVal, earlierSeq,
                        outputPos);

                    // The block just processed becomes the "previous" block for the next round.
                    earlierSeq = activeSeq;
                    activeSeq = new List<KeyValuePair<int, uint>>();
                    pendingIdx = survivor;
                    if (pendingIdx != -1)
                    {
                        pendingVal = activeBlock[pendingIdx];
                    }

                    var swap = activeBlock;
                    activeBlock = earlierBlock;
                    earlierBlock = swap;

                    filled = 0;
                    blockStartPos += BlockSize;
                }
            }

            if (filled != 0)
            {
                // Partial last block: pad the unused tail with zeros and process it like a full one.
                Array.Clear(activeBlock, filled, activeBlock.Length - filled);

                var survivor = ProcessOneBlock(activeBlock, blockStartPos, activeSeq,
                    earlierBlock, pendingIdx, pendingVal, earlierSeq,
                    outputPos);

                pendingIdx = survivor;
                if (pendingIdx != -1)
                {
                    pendingVal = activeBlock[pendingIdx];
                }
                blockStartPos += BlockSize;
            }

            if (pendingIdx != -1)
            {
                // blockStartPos has already moved past the last processed block, so step back one block.
                outputPos.Add(blockStartPos - earlierBlock.Length + pendingIdx);
            }
            outputPos.CompleteAdding();
        }
Пример #7
0
        /// <summary>
        /// Processes one block of values: builds the block's greedy sequence, decides whether the
        /// candidate carried over from the previous block survives (emitting its position if so),
        /// and returns the index of this block's own surviving candidate.
        /// </summary>
        /// <param name="currBlock">Values of the block being processed.</param>
        /// <param name="currBlockStartPos">Global position of the first element of <paramref name="currBlock"/>.</param>
        /// <param name="currGreedySeq">Cleared and then filled with (index, value) pairs for this block.</param>
        /// <param name="prevBlock">Previous block; only its <c>Count</c> is read here.</param>
        /// <param name="liveCanPrevBlockIdx">Index of the live candidate in the previous block, or -1 if none.</param>
        /// <param name="liveCanPrevBlockVal">Value of that candidate; only read when the index is not -1.</param>
        /// <param name="prevGreddySeq">Greedy sequence of the previous block, used to validate this block's candidate.</param>
        /// <param name="localMaximaPos">Receives the global position of the previous block's candidate when it survives.</param>
        /// <returns>Index within <paramref name="currBlock"/> of the surviving candidate, or -1 if none.</returns>
        /// <remarks>
        /// NOTE(review): <c>OrdinaryRun</c> is defined elsewhere; it appears to scan the given index range
        /// while extending the greedy sequence and updating the live candidate - confirm against its definition.
        /// </remarks>
        private int ProcessOneBlock(IList<uint> currBlock, int currBlockStartPos, List<KeyValuePair<int, uint>> currGreedySeq,
            IList<uint> prevBlock, int liveCanPrevBlockIdx, uint liveCanPrevBlockVal, List<KeyValuePair<int, uint>> prevGreddySeq,
            BlockingCollectionDataChunk<int> localMaximaPos)
        {
            var currBlockStart = 0;
            var currBlockEnd = currBlock.Count - 1;

            // Seed the greedy sequence with the last element of the block, which also starts out as
            // this block's live candidate.
            currGreedySeq.Clear();
            currGreedySeq.Add(new KeyValuePair<int, uint>(currBlockEnd, currBlock[currBlockEnd]));
            var currLiveCanIdx = currBlockEnd;

            if (liveCanPrevBlockIdx == -1)
            {
                // No live candidate in the previous block, do the ordinary run.
                OrdinaryRun(currBlock, currBlockStart, currBlockEnd - 1, currGreedySeq, ref currLiveCanIdx);
            }
            else
            {
                // Live candidate in the prev block, do modified run on this block
                // Ordinary until m + h (exclusive)
                OrdinaryRun(currBlock, liveCanPrevBlockIdx, currBlockEnd - 1, currGreedySeq, ref currLiveCanIdx);

                // The remaining indices [currBlockStart, liveCanPrevBlockIdx - 1] are walked downwards
                // by modIdx and compared against the previous block's candidate value.
                var lastInCurrGreedySeq = currGreedySeq[currGreedySeq.Count - 1].Value;
                var modIdx = liveCanPrevBlockIdx - 1;

                if (lastInCurrGreedySeq >= liveCanPrevBlockVal)
                {
                    // F(g) >= F(m)
                    // Only look for a value that kills m; the greedy sequence is not extended in this loop.
                    while (modIdx >= currBlockStart)
                    {
                        if (currBlock[modIdx] >= liveCanPrevBlockVal)
                        {
                            // Kill m
                            liveCanPrevBlockIdx = -1;
                            break;
                        }
                        --modIdx;
                    }
                    // Hand the rest of the range (including the killing index, if any) to the ordinary run.
                    OrdinaryRun(currBlock, currBlockStart, modIdx, currGreedySeq, ref currLiveCanIdx);
                }
                else
                {
                    // F(g) < F(m)
                    // Extend the greedy sequence by hand until m is killed or the block start is reached.
                    var lastValue = currGreedySeq[currGreedySeq.Count - 1].Value;
                    while (modIdx >= currBlockStart)
                    {
                        if (currBlock[modIdx] > lastValue)
                        {
                            // Strictly greater than, add to the greedy sequence.
                            currGreedySeq.Add(new KeyValuePair<int, uint>(modIdx, currBlock[modIdx]));
                            lastValue = currBlock[modIdx];
                            currLiveCanIdx = modIdx;
                            if (currBlock[modIdx] >= liveCanPrevBlockVal)
                            {
                                // Kill m
                                liveCanPrevBlockIdx = -1;
                                // This index is already in the greedy sequence, so step past it
                                // before handing over to the ordinary run.
                                --modIdx;
                                break;
                            }
                        }
                        else if (currBlock[modIdx] == lastValue)
                        {
                            // Equal: kill the current candidate but don't add to the greedy sequence.
                            currLiveCanIdx = -1;
                        }
                        --modIdx;
                    }
                    OrdinaryRun(currBlock, currBlockStart, modIdx, currGreedySeq, ref currLiveCanIdx);
                }
                // After the modified run, if the candidate in the previous block is still alive => add to output.
                if (liveCanPrevBlockIdx != -1)
                {
                    // Convert the previous-block index into a global position.
                    localMaximaPos.Add(currBlockStartPos - prevBlock.Count + liveCanPrevBlockIdx);
                }
            }

            // Check if current candidate satisfies all applicable items in the previous block.
            if (currLiveCanIdx != -1)
            {
                for (var i = 0; i < prevGreddySeq.Count; ++i)
                {
                    // Stop at the first entry whose index is not greater than the candidate's index.
                    if (prevGreddySeq[i].Key < currLiveCanIdx + 1)
                    {
                        break;
                    }
                    // An entry at least as large as the candidate disqualifies it.
                    if (prevGreddySeq[i].Value >= currBlock[currLiveCanIdx])
                    {
                        currLiveCanIdx = -1;
                        break;
                    }
                }
            }

            return currLiveCanIdx;
        }
Пример #8
0
        /// <summary>
        /// Convenience wrapper that runs the streaming block algorithm over an in-memory list and
        /// returns each reported position paired with the list value at that position.
        /// </summary>
        /// <param name="list">Input values; each is cast to uint before being processed.</param>
        /// <returns>One (position, value) pair per position reported by the block algorithm.</returns>
        List<KeyValuePair<int, int>> CalcUsingBlockAlgo(List<int> list)
        {
            var positionsOut = new BlockingCollectionDataChunk<int>();
            var valuesIn = new BlockingCollectionDataChunk<uint>();

            // Load the whole input up front, then close it so the algorithm sees the end of the data.
            foreach (var value in list)
            {
                valuesIn.Add((uint)value);
            }
            valuesIn.CompleteAdding();

            CalcUsingBlockAlgo(valuesIn, positionsOut);

            var pairs = new List<KeyValuePair<int, int>>();
            foreach (var pos in positionsOut.ToList())
            {
                pairs.Add(new KeyValuePair<int, int>(pos, list[pos]));
            }
            return pairs;
        }
Пример #9
0
        /// <summary>
        /// Advances the window by one byte and publishes the next 32-bit window value: the previous
        /// value shifted right by 8 bits with the incoming byte placed in the top 8 bits.
        /// </summary>
        /// <param name="currHashEndIdx">Index of the window's last byte in <paramref name="buffer"/>; incremented by one.</param>
        /// <param name="buffer">Buffer holding the incoming byte.</param>
        /// <param name="prevHashValue">Previous window value; replaced with the new one.</param>
        /// <param name="hashValues">Receives the new window value.</param>
        private void CalcHashEndIdxUInt32(ref int currHashEndIdx, byte[] buffer, ref uint prevHashValue, BlockingCollectionDataChunk<uint> hashValues)
        {
            currHashEndIdx += 1;

            // Widen to uint BEFORE shifting. Shifting the byte as a signed int and casting afterwards
            // yields a negative int for bytes >= 0x80, which throws OverflowException when the
            // assembly is compiled with checked arithmetic.
            var bitMask = (uint)buffer[currHashEndIdx] << 24;

            var hv = (prevHashValue >> 8) | bitMask;

            hashValues.Add(hv);
            prevHashValue = hv;
        }
Пример #10
0
        /// <summary>
        /// Advances the Adler-32 window by one byte and publishes the rolled hash value.
        /// </summary>
        /// <param name="currHashEndIdx">Index of the window's last byte in <paramref name="buffer"/>; incremented by one.</param>
        /// <param name="prevBuffer">Previous buffer, consulted when the byte leaving the window lies before the start of <paramref name="buffer"/>.</param>
        /// <param name="buffer">Current buffer holding the byte entering the window.</param>
        /// <param name="prevHashValue">Previous hash value; replaced with the rolled one.</param>
        /// <param name="hashValues">Receives the rolled hash value.</param>
        private void CalcAdlerHashEndIdx(ref int currHashEndIdx, byte[] prevBuffer, byte[] buffer, ref uint prevHashValue, BlockingCollectionDataChunk<uint> hashValues)
        {
            ++currHashEndIdx;

            // Index of the byte that drops out of the window; negative means it is in the previous buffer.
            var leavingIdx = currHashEndIdx - HashBlock;

            byte leaving;
            if (leavingIdx >= 0)
            {
                leaving = buffer[leavingIdx];
            }
            else
            {
                leaving = prevBuffer[BufferSize + leavingIdx];
            }

            var entering = buffer[currHashEndIdx];
            var rolled = Adler32Checksum.Roll(leaving, entering, prevHashValue, HashBlock);

            hashValues.Add(rolled);
            prevHashValue = rolled;
        }
Пример #11
0
        /// <summary>
        /// Reads <paramref name="inputStream"/> in <c>BufferSize</c>-byte blocks and publishes, for each
        /// byte position, a 32-bit value built from a sliding 4-byte window.
        /// </summary>
        /// <param name="inputStream">Stream to process; read sequentially from its current position.</param>
        /// <param name="hashValues">Receives the window values; marked complete before returning.</param>
        /// <remarks>
        /// Data past the end of the stream is treated as zero bytes.
        /// The buffer is filled with repeated reads because <see cref="Stream.Read(byte[], int, int)"/>
        /// may return fewer bytes than requested before the end of the stream; treating such a short
        /// read as the final block would zero-pad the middle of the data and misalign the window.
        /// </remarks>
        public void StreamToUInt32HashValues(Stream inputStream, BlockingCollectionDataChunk<uint> hashValues)
        {
            var buffer = new byte[BufferSize];
            var starting = true;
            var prevHashValue = 0U;

            var currHashEndIdx = -1;
            var hEndIdx = -1;

            while (true)
            {
                // Fill the buffer completely unless the stream ends first.
                var byteRead = 0;
                while (byteRead < BufferSize)
                {
                    var justRead = inputStream.Read(buffer, byteRead, BufferSize - byteRead);
                    if (justRead == 0)
                    {
                        break;
                    }
                    byteRead += justRead;
                }
                if (byteRead == 0)
                {
                    // End of stream.
                    break;
                }

                if (starting)
                {
                    // Seed the window with the first four bytes of the stream.
                    var hv = BitConverter.ToUInt32(buffer, 0);
                    hashValues.Add(hv);
                    prevHashValue = hv;
                    starting = false;
                    currHashEndIdx = 4 - 1;
                }
                if (byteRead < BufferSize)
                {
                    // Zero the stale tail so the window slides over zeros past the end of the data.
                    Array.Clear(buffer, byteRead, BufferSize - byteRead);
                }
                // Window end index at which the last value for the data read so far is produced.
                hEndIdx = byteRead + 4 - 2;

                while (currHashEndIdx < BufferSize - 1 && currHashEndIdx != hEndIdx)
                {
                    CalcHashEndIdxUInt32(ref currHashEndIdx, buffer, ref prevHashValue, hashValues);
                }
                // Re-base both indices so they are relative to the next buffer fill.
                currHashEndIdx -= BufferSize;
                hEndIdx -= BufferSize;
            }
            if (currHashEndIdx != hEndIdx)
            {
                // Need an empty final block.
                Array.Clear(buffer, 0, BufferSize);

                while (currHashEndIdx != hEndIdx)
                {
                    CalcHashEndIdxUInt32(ref currHashEndIdx, buffer, ref prevHashValue, hashValues);
                }
            }
            hashValues.CompleteAdding();
        }