/// <summary>
/// Splits <paramref name="stream"/> into consecutive partitions at the supplied cut
/// positions and publishes, for every partition, the value returned by
/// <c>CalcStreamPortion</c> together with the partition's offset and length.
/// </summary>
/// <param name="stream">Source stream; its <c>Length</c> is read to size the final partition.</param>
/// <param name="positions">
/// Cut positions, consumed until the collection is completed. Each position is treated
/// as the last byte (inclusive) of a partition.
/// </param>
/// <param name="partitionHashValues">Receives one value per partition; completed before returning.</param>
/// <param name="fileChunkInfo">Receives one offset/length record per partition; completed before returning.</param>
public void ProcessStream(Stream stream, BlockingCollectionDataChunk<int> positions, BlockingCollectionDataChunk<uint> partitionHashValues, BlockingCollectionDataChunk<FileChunkInfo> fileChunkInfo)
{
    // Offset of the first byte of the partition currently being built.
    var partitionStart = 0;

    foreach (var chunk in positions.BlockingCollection.GetConsumingEnumerable())
    {
        for (var idx = 0; idx < chunk.DataSize; idx++)
        {
            var cutPos = chunk.Data[idx];

            // The cut position itself belongs to this partition, hence the +1.
            var length = cutPos - partitionStart + 1;

            // NOTE(review): CalcStreamPortion presumably reads `length` bytes from the
            // stream's current position - confirm against its definition.
            partitionHashValues.Add(CalcStreamPortion(stream, length));
            fileChunkInfo.Add(new FileChunkInfo() { Pos = partitionStart, Length = length });

            // The next partition begins right after the cut position.
            partitionStart = cutPos + 1;
        }
    }

    // Whatever lies between the last cut position and the end of the stream forms
    // the final partition; nothing is emitted when that remainder is empty.
    var tailLength = (int)stream.Length - partitionStart;
    if (tailLength != 0)
    {
        partitionHashValues.Add(CalcStreamPortion(stream, tailLength));
        fileChunkInfo.Add(new FileChunkInfo() { Pos = partitionStart, Length = tailLength });
    }

    partitionHashValues.CompleteAdding();
    fileChunkInfo.CompleteAdding();
}
/// <summary>
/// Streams <paramref name="inputStream"/> through a sliding window of <c>HashBlock</c>
/// bytes and publishes the rolling checksum of every window position to
/// <paramref name="hashValues"/>.
/// </summary>
/// <remarks>
/// The first value comes from <c>Adler32Checksum.Calculate</c> over the first window;
/// every later value is produced by <c>CalcAdlerHashEndIdx</c>, which rolls the window
/// forward by one byte. One value is emitted per input byte; windows that extend past
/// the end of the stream are completed with zero bytes. An empty stream produces no
/// values. Assumes <c>HashBlock</c> does not exceed <c>BufferSize</c>.
/// </remarks>
/// <param name="inputStream">Stream to read; consumed sequentially until it reports end of stream.</param>
/// <param name="hashValues">Receives the hash values in order; completed before returning.</param>
public void StreamToHashValues(Stream inputStream, BlockingCollectionDataChunk<uint> hashValues)
{
    // Two buffers are kept because the byte leaving the window may still live at the
    // tail of the previously read buffer.
    var prevBuffer = new byte[BufferSize];
    var buffer = new byte[BufferSize];
    var starting = true;
    var prevHashValue = 0U;

    // Both indices are relative to the start of the current buffer.
    var currHashEndIdx = -1; // last byte of the most recently hashed window
    var hEndIdx = -1;        // last byte of the final window owed for the data read so far

    while (true)
    {
        // Stream.Read may return fewer bytes than requested before the end of the
        // stream. The index bookkeeping below is only valid when every block except
        // the last is completely filled, so keep reading until the buffer is full or
        // the stream is exhausted.
        var byteRead = 0;
        int justRead;
        while (byteRead < BufferSize
               && (justRead = inputStream.Read(buffer, byteRead, BufferSize - byteRead)) > 0)
        {
            byteRead += justRead;
        }

        if (byteRead == 0)
        {
            break;
        }

        if (starting)
        {
            // On the first pass the buffer is freshly allocated, so any bytes beyond
            // byteRead are already zero when the initial window is hashed.
            var hv = Adler32Checksum.Calculate(buffer, 0, HashBlock);
            hashValues.Add(hv);
            prevHashValue = hv;
            starting = false;
            currHashEndIdx = HashBlock - 1;
        }

        if (byteRead < BufferSize)
        {
            // Short (final) block: wipe stale bytes left over from an earlier read.
            Array.Clear(buffer, byteRead, BufferSize - byteRead);
        }

        // One window per byte read: the last window starts at the last byte read.
        hEndIdx = byteRead + HashBlock - 2;
        while (currHashEndIdx < BufferSize - 1 && currHashEndIdx != hEndIdx)
        {
            CalcAdlerHashEndIdx(ref currHashEndIdx, prevBuffer, buffer, ref prevHashValue, hashValues);
        }

        // Re-base both indices onto the buffer that will be filled next.
        currHashEndIdx -= BufferSize;
        hEndIdx -= BufferSize;

        // Swap the two buffers so the data just processed becomes "previous".
        var temp = buffer;
        buffer = prevBuffer;
        prevBuffer = temp;
    }

    if (currHashEndIdx != hEndIdx)
    {
        // Windows are still owed that reach past the end of the data: roll them
        // through an all-zero block.
        Array.Clear(buffer, 0, BufferSize);
        while (currHashEndIdx != hEndIdx)
        {
            CalcAdlerHashEndIdx(ref currHashEndIdx, prevBuffer, buffer, ref prevHashValue, hashValues);
        }
    }

    hashValues.CompleteAdding();
}
/// <summary>
/// Reports the running index of every incoming hash value that is divisible by 1024.
/// </summary>
/// <param name="inputList">Hash values, consumed chunk by chunk until the collection is completed.</param>
/// <param name="outputPos">
/// Receives the zero-based index (counted across all chunks) of each matching value;
/// completed before returning.
/// </param>
public void CalcNew(BlockingCollectionDataChunk<uint> inputList, BlockingCollectionDataChunk<int> outputPos)
{
    const int HashModulus = 2 * 512;

    // Index of the first element of the chunk being scanned, counted from the start
    // of the whole input.
    var chunkBase = 0;

    foreach (var chunk in inputList.BlockingCollection.GetConsumingEnumerable())
    {
        for (var offset = 0; offset < chunk.DataSize; offset++)
        {
            if (chunk.Data[offset] % HashModulus != 0)
            {
                continue;
            }

            outputPos.Add(chunkBase + offset);
        }

        chunkBase += chunk.DataSize;
    }

    outputPos.CompleteAdding();
}
/// <summary>
/// Runs the naive (non-pipelined) variant of the chunking pipeline on a file and
/// appends the resulting partition hash values to <paramref name="partitionHash"/>.
/// </summary>
/// <remarks>
/// The stages run one after another: rolling hash over the whole file held in memory,
/// naive local-maxima search, then partition hashing over a second read of the file.
/// </remarks>
/// <param name="filename">Path of the file to process; read fully into memory and then opened again for reading.</param>
/// <param name="partitionHash">Receives every partition hash value, cast to <c>int</c>.</param>
static void ProcessFileNaive(string filename, List<int> partitionHash)
{
    // Stage 1: rolling hash over the complete file contents.
    var rollingHash = new List<uint>();
    var fileBytes = File.ReadAllBytes(filename);
    using (var ms = new MemoryStream(fileBytes, 0, fileBytes.Length, true, true))
    {
        var fh = new FileHash(1024);
        fh.StreamToHashValuesNaive(ms, rollingHash);
    }

    // Stage 2: local maxima of the rolling hash become the cut positions.
    var localMaximaPos = new List<int>();
    var lm = new LocalMaxima(4 * 1024);
    lm.CalcUsingNaive(rollingHash, localMaximaPos);

    // ProcessStream consumes its positions from a blocking collection, so copy the
    // list across and mark the collection as complete.
    var localMaximaPosBC = new BlockingCollectionDataChunk<int>();
    for (var k = 0; k < localMaximaPos.Count; k++)
    {
        localMaximaPosBC.Add(localMaximaPos[k]);
    }

    localMaximaPosBC.CompleteAdding();

    // Stage 3: hash every partition of the file.
    var fciBC = new BlockingCollectionDataChunk<FileChunkInfo>();
    var ph = new BlockingCollectionDataChunk<uint>();
    var mmh = new MurmurHash3_x86_32();
    var fph = new FileParitionHash(mmh);
    using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read, FileShare.Read))
    {
        fph.ProcessStream(fs, localMaximaPosBC, ph, fciBC);
    }

    // Drain the partition hashes into the caller's list.
    foreach (var items in ph.BlockingCollection.GetConsumingEnumerable())
    {
        var slot = 0;
        while (slot < items.DataSize)
        {
            partitionHash.Add((int)items.Data[slot]);
            slot++;
        }
    }
}
/// <summary>
/// Regroups the incoming values into blocks of <c>BlockSize</c> elements, runs
/// <c>ProcessOneBlock</c> on each block in order, and publishes the resulting
/// positions to <paramref name="outputPos"/>.
/// </summary>
/// <remarks>
/// A trailing, partially filled block is padded with zeros before it is processed.
/// A candidate that is still alive after the last block is emitted as-is.
/// </remarks>
/// <param name="inputList">Values to scan, consumed chunk by chunk until the collection is completed.</param>
/// <param name="outputPos">Receives positions counted from the start of the input; completed before returning.</param>
public void CalcUsingBlockAlgo(BlockingCollectionDataChunk<uint> inputList, BlockingCollectionDataChunk<int> outputPos)
{
    // Position, in the whole input, of the first element of the block being filled.
    var blockStartPos = 0;

    // Candidate carried over from the most recently processed block (-1 = none).
    var pendingIdx = -1;
    var pendingVal = 0U;

    var activeSeq = new List<KeyValuePair<int, uint>>();
    var olderSeq = new List<KeyValuePair<int, uint>>();
    var activeBlock = new uint[BlockSize];
    var olderBlock = new uint[BlockSize];

    // Number of elements already copied into activeBlock.
    var filled = 0;

    foreach (var chunk in inputList.BlockingCollection.GetConsumingEnumerable())
    {
        var chunkLength = chunk.DataSize;
        var consumed = 0;
        while (consumed != chunkLength)
        {
            // Top up the active block with as much of this chunk as fits.
            var count = Math.Min(chunkLength - consumed, BlockSize - filled);
            Array.Copy(chunk.Data, consumed, activeBlock, filled, count);
            consumed += count;
            filled += count;

            if (filled != BlockSize)
            {
                continue;
            }

            var survivorIdx = ProcessOneBlock(activeBlock, blockStartPos, activeSeq, olderBlock, pendingIdx, pendingVal, olderSeq, outputPos);

            // The block just processed becomes the "previous" one for the next round.
            olderSeq = activeSeq;
            activeSeq = new List<KeyValuePair<int, uint>>();
            pendingIdx = survivorIdx;
            if (pendingIdx != -1)
            {
                // Must be read before the buffers are swapped below.
                pendingVal = activeBlock[pendingIdx];
            }

            var justProcessed = activeBlock;
            activeBlock = olderBlock;
            olderBlock = justProcessed;

            filled = 0;
            blockStartPos += BlockSize;
        }
    }

    if (filled != 0)
    {
        // Partially filled final block: zero the unused tail, then process it like
        // any other block.
        Array.Clear(activeBlock, filled, activeBlock.Length - filled);
        pendingIdx = ProcessOneBlock(activeBlock, blockStartPos, activeSeq, olderBlock, pendingIdx, pendingVal, olderSeq, outputPos);
        if (pendingIdx != -1)
        {
            pendingVal = activeBlock[pendingIdx];
        }

        blockStartPos += BlockSize;
    }

    if (pendingIdx != -1)
    {
        // blockStartPos is one block past the last processed block, so step back one
        // block length to translate the in-block index to an input position.
        outputPos.Add(blockStartPos - olderBlock.Length + pendingIdx);
    }

    outputPos.CompleteAdding();
}
/// <summary>
/// Processes one block of values: rebuilds the block's greedy sequence, decides whether
/// the candidate carried over from the previous block survives (emitting it if so), and
/// returns the index of this block's own surviving candidate.
/// </summary>
/// <param name="currBlock">Values of the block being processed.</param>
/// <param name="currBlockStartPos">Position of <paramref name="currBlock"/>'s first element, used to translate the previous block's candidate index into an output position.</param>
/// <param name="currGreedySeq">Cleared on entry and refilled with (index, value) pairs for this block; it always starts with the block's last element.</param>
/// <param name="prevBlock">Previous block; only its <c>Count</c> is read here.</param>
/// <param name="liveCanPrevBlockIdx">Index, within the previous block, of the candidate still alive there, or -1 when there is none.</param>
/// <param name="liveCanPrevBlockVal">Value of that candidate; only meaningful when <paramref name="liveCanPrevBlockIdx"/> is not -1.</param>
/// <param name="prevGreddySeq">(index, value) pairs produced for the previous block; used to validate this block's candidate.</param>
/// <param name="localMaximaPos">Receives the position of the previous block's candidate when it survives this block.</param>
/// <returns>Index within <paramref name="currBlock"/> of the candidate that is still alive, or -1 when none is.</returns>
private int ProcessOneBlock(IList<uint> currBlock, int currBlockStartPos, List<KeyValuePair<int, uint>> currGreedySeq, IList<uint> prevBlock, int liveCanPrevBlockIdx, uint liveCanPrevBlockVal, List<KeyValuePair<int, uint>> prevGreddySeq, BlockingCollectionDataChunk<int> localMaximaPos)
{
    var currBlockStart = 0;
    var currBlockEnd = currBlock.Count - 1;

    // Seed the greedy sequence with the last element of the block, which also starts
    // out as this block's live candidate.
    currGreedySeq.Clear();
    currGreedySeq.Add(new KeyValuePair<int, uint>(currBlockEnd, currBlock[currBlockEnd]));
    var currLiveCanIdx = currBlockEnd;

    // NOTE(review): OrdinaryRun is defined elsewhere; it presumably walks the given
    // index range from high to low, extending currGreedySeq and updating
    // currLiveCanIdx - confirm against its definition.
    if (liveCanPrevBlockIdx == -1)
    {
        // No live candidate in the previous block, do the ordinary run.
        OrdinaryRun(currBlock, currBlockStart, currBlockEnd - 1, currGreedySeq, ref currLiveCanIdx);
    }
    else
    {
        // Live candidate in the prev block, do modified run on this block.
        // Ordinary until m + h (exclusive): the previous candidate's in-block index is
        // reused as the lower bound of the range in the current block.
        OrdinaryRun(currBlock, liveCanPrevBlockIdx, currBlockEnd - 1, currGreedySeq, ref currLiveCanIdx);
        var lastInCurrGreedySeq = currGreedySeq[currGreedySeq.Count - 1].Value;

        // Elements below this index are the ones still able to decide the fate of the
        // previous candidate.
        var modIdx = liveCanPrevBlockIdx - 1;
        if (lastInCurrGreedySeq >= liveCanPrevBlockVal)
        {
            // F(g) >= F(m)
            // Walk down looking for the first element that is at least as large as
            // the previous candidate.
            while (modIdx >= currBlockStart)
            {
                if (currBlock[modIdx] >= liveCanPrevBlockVal)
                {
                    // Kill m. modIdx is deliberately left pointing at this element so
                    // that the ordinary run below still processes it.
                    liveCanPrevBlockIdx = -1;
                    break;
                }
                --modIdx;
            }
            OrdinaryRun(currBlock, currBlockStart, modIdx, currGreedySeq, ref currLiveCanIdx);
        }
        else
        {
            // F(g) < F(m)
            // Extend the greedy sequence by hand until the previous candidate is
            // killed or the start of the block is reached.
            var lastValue = currGreedySeq[currGreedySeq.Count - 1].Value;
            while (modIdx >= currBlockStart)
            {
                if (currBlock[modIdx] > lastValue)
                {
                    // Strictly greater than, add to the greedy sequence.
                    currGreedySeq.Add(new KeyValuePair<int, uint>(modIdx, currBlock[modIdx]));
                    lastValue = currBlock[modIdx];
                    currLiveCanIdx = modIdx;
                    if (currBlock[modIdx] >= liveCanPrevBlockVal)
                    {
                        // Kill m. This element has already been added to the greedy
                        // sequence, so step past it before handing over to the
                        // ordinary run.
                        liveCanPrevBlockIdx = -1;
                        --modIdx;
                        break;
                    }
                }
                else if (currBlock[modIdx] == lastValue)
                {
                    // Equal: kill the current candidate but don't add to the greedy sequence.
                    currLiveCanIdx = -1;
                }
                --modIdx;
            }
            // When the loop ran to completion modIdx is below currBlockStart, so the
            // range passed here is empty.
            OrdinaryRun(currBlock, currBlockStart, modIdx, currGreedySeq, ref currLiveCanIdx);
        }

        // After the modified run, if the candidate in the previous block is still alive => add to output.
        if (liveCanPrevBlockIdx != -1)
        {
            // The previous block starts prevBlock.Count elements before this one.
            localMaximaPos.Add(currBlockStartPos - prevBlock.Count + liveCanPrevBlockIdx);
        }
    }

    // Check if current candidate satisfies all applicable items in the previous block.
    if (currLiveCanIdx != -1)
    {
        for (var i = 0; i < prevGreddySeq.Count; ++i)
        {
            // Only entries whose in-block index lies above the candidate's index are
            // compared; the scan stops at the first entry at or below it.
            if (prevGreddySeq[i].Key < currLiveCanIdx + 1)
            {
                break;
            }
            // A previous-block entry at least as large as the candidate kills it.
            if (prevGreddySeq[i].Value >= currBlock[currLiveCanIdx])
            {
                currLiveCanIdx = -1;
                break;
            }
        }
    }
    return currLiveCanIdx;
}
/// <summary>
/// List-based convenience wrapper around the blocking-collection overload of
/// <c>CalcUsingBlockAlgo</c>.
/// </summary>
/// <param name="list">Values to scan; each is cast to <c>uint</c> before processing.</param>
/// <returns>One (position, original value) pair for every position reported by the block algorithm.</returns>
List<KeyValuePair<int, int>> CalcUsingBlockAlgo(List<int> list)
{
    // Feed the whole list into a completed input collection.
    var inList = new BlockingCollectionDataChunk<uint>();
    for (var k = 0; k < list.Count; k++)
    {
        inList.Add((uint)list[k]);
    }

    inList.CompleteAdding();

    var retPos = new BlockingCollectionDataChunk<int>();
    CalcUsingBlockAlgo(inList, retPos);

    // Pair every reported position with the value found there in the original list.
    var pairs = new List<KeyValuePair<int, int>>();
    foreach (var pos in retPos.ToList())
    {
        pairs.Add(new KeyValuePair<int, int>(pos, list[pos]));
    }

    return pairs;
}
/// <summary>
/// Advances the 4-byte sliding window by one byte and publishes the new window value.
/// </summary>
/// <param name="currHashEndIdx">Index in <paramref name="buffer"/> of the window's last byte; incremented by one before the byte is read.</param>
/// <param name="buffer">Buffer holding the byte that enters the window.</param>
/// <param name="prevHashValue">Previous window value on entry; the new window value on exit.</param>
/// <param name="hashValues">Receives the new window value.</param>
private void CalcHashEndIdxUInt32(ref int currHashEndIdx, byte[] buffer, ref uint prevHashValue, BlockingCollectionDataChunk<uint> hashValues)
{
    currHashEndIdx += 1;

    // Widen to uint BEFORE shifting. Shifting the byte as an int yields a negative
    // value for bytes >= 0x80, and converting that to uint throws OverflowException
    // when the code is compiled with checked arithmetic.
    var incoming = (uint)buffer[currHashEndIdx] << 24;

    // Drop the oldest byte from the low end and place the new byte at the high end.
    var hv = (prevHashValue >> 8) | incoming;
    hashValues.Add(hv);
    prevHashValue = hv;
}
/// <summary>
/// Rolls the checksum window forward by one byte via <c>Adler32Checksum.Roll</c> and
/// publishes the new value.
/// </summary>
/// <param name="currHashEndIdx">Index in <paramref name="buffer"/> of the window's last byte; incremented by one before use.</param>
/// <param name="prevBuffer">Previously read buffer, consulted when the byte leaving the window lies before the start of <paramref name="buffer"/>.</param>
/// <param name="buffer">Current buffer, holding the byte that enters the window.</param>
/// <param name="prevHashValue">Previous checksum on entry; the new checksum on exit.</param>
/// <param name="hashValues">Receives the new checksum.</param>
private void CalcAdlerHashEndIdx(ref int currHashEndIdx, byte[] prevBuffer, byte[] buffer, ref uint prevHashValue, BlockingCollectionDataChunk<uint> hashValues)
{
    ++currHashEndIdx;

    // Index of the byte that falls out of the window, relative to the current buffer.
    var leavingIdx = currHashEndIdx - HashBlock;

    byte leaving;
    if (leavingIdx >= 0)
    {
        leaving = buffer[leavingIdx];
    }
    else
    {
        // A negative index means the byte sits at the tail of the previous buffer.
        leaving = prevBuffer[BufferSize + leavingIdx];
    }

    var rolled = Adler32Checksum.Roll(leaving, buffer[currHashEndIdx], prevHashValue, HashBlock);
    hashValues.Add(rolled);
    prevHashValue = rolled;
}
/// <summary>
/// Streams <paramref name="inputStream"/> through a sliding 4-byte window and publishes
/// the value of every window position to <paramref name="hashValues"/>.
/// </summary>
/// <remarks>
/// The first value is <c>BitConverter.ToUInt32</c> of the first four bytes; every later
/// value is produced by <c>CalcHashEndIdxUInt32</c>, which shifts the window forward by
/// one byte. One value is emitted per input byte; windows that extend past the end of
/// the stream are completed with zero bytes. An empty stream produces no values.
/// Assumes <c>BufferSize</c> is at least 4.
/// </remarks>
/// <param name="inputStream">Stream to read; consumed sequentially until it reports end of stream.</param>
/// <param name="hashValues">Receives the window values in order; completed before returning.</param>
public void StreamToUInt32HashValues(Stream inputStream, BlockingCollectionDataChunk<uint> hashValues)
{
    const int WindowSize = 4;

    var buffer = new byte[BufferSize];
    var starting = true;
    var prevHashValue = 0U;

    // Both indices are relative to the start of the current buffer contents.
    var currHashEndIdx = -1; // last byte of the most recently emitted window
    var hEndIdx = -1;        // last byte of the final window owed for the data read so far

    while (true)
    {
        // Stream.Read may return fewer bytes than requested before the end of the
        // stream. The index bookkeeping below is only valid when every block except
        // the last is completely filled, so keep reading until the buffer is full or
        // the stream is exhausted.
        var byteRead = 0;
        int justRead;
        while (byteRead < BufferSize
               && (justRead = inputStream.Read(buffer, byteRead, BufferSize - byteRead)) > 0)
        {
            byteRead += justRead;
        }

        if (byteRead == 0)
        {
            break;
        }

        if (starting)
        {
            // On the first pass the buffer is freshly allocated, so any bytes beyond
            // byteRead are already zero when the initial window is read.
            var hv = BitConverter.ToUInt32(buffer, 0);
            hashValues.Add(hv);
            prevHashValue = hv;
            starting = false;
            currHashEndIdx = WindowSize - 1;
        }

        if (byteRead < BufferSize)
        {
            // Short (final) block: wipe stale bytes left over from an earlier read.
            Array.Clear(buffer, byteRead, BufferSize - byteRead);
        }

        // One window per byte read: the last window starts at the last byte read.
        hEndIdx = byteRead + WindowSize - 2;
        while (currHashEndIdx < BufferSize - 1 && currHashEndIdx != hEndIdx)
        {
            CalcHashEndIdxUInt32(ref currHashEndIdx, buffer, ref prevHashValue, hashValues);
        }

        // Re-base both indices onto the next buffer-load.
        currHashEndIdx -= BufferSize;
        hEndIdx -= BufferSize;
    }

    if (currHashEndIdx != hEndIdx)
    {
        // Windows are still owed that reach past the end of the data: shift them
        // through an all-zero block.
        Array.Clear(buffer, 0, BufferSize);
        while (currHashEndIdx != hEndIdx)
        {
            CalcHashEndIdxUInt32(ref currHashEndIdx, buffer, ref prevHashValue, hashValues);
        }
    }

    hashValues.CompleteAdding();
}