/// <summary>
/// Quick check to make sure the signature does not contain duplicate MD5 hashes.
/// </summary>
/// <param name="sig">Signature whose per-block MD5 hashes are inspected.</param>
/// <returns>
/// <c>true</c> when every block MD5 in <paramref name="sig"/> is unique;
/// <c>false</c> as soon as a repeated MD5 is encountered.
/// </returns>
private Boolean VerifySignature(SizeBasedCompleteSignature sig)
{
    var seenMd5s = new HashSet<string>();

    foreach (var completeSig in sig.Signatures.Values)
    {
        foreach (var blockSig in completeSig.SignatureList)
        {
            // HashSet.Add returns false when the value is already present,
            // so one call both records the hash and detects a duplicate.
            if (!seenMd5s.Add(ByteArrayToString(blockSig.MD5Signature)))
            {
                return false;
            }
        }
    }

    return true;
}
/// <summary>
/// Builds a size-keyed signature from a list of uploaded blocks.
/// Each block's signature is filed under its block size (cast to int), and
/// every size bucket becomes one <c>CompleteSignature</c>.
/// </summary>
/// <param name="allBlocks">Blocks whose <c>Sig</c> entries make up the resulting signature.</param>
/// <returns>A signature whose dictionary maps block size to the signatures of that size.</returns>
internal static SizeBasedCompleteSignature CreateSignatureFromNewAndReusedBlocks(List<UploadedBlock> allBlocks)
{
    var blocksBySize = new Dictionary<int, List<BlockSignature>>();

    // Bucket every block signature by its size.
    foreach (var block in allBlocks)
    {
        var sizeKey = (int)block.Sig.Size;

        List<BlockSignature> bucket;
        if (!blocksBySize.TryGetValue(sizeKey, out bucket))
        {
            bucket = new List<BlockSignature>();
            blocksBySize[sizeKey] = bucket;
        }

        bucket.Add(block.Sig);
    }

    var result = new SizeBasedCompleteSignature();
    result.Signatures = new Dictionary<int, CompleteSignature>();

    // Freeze each bucket into an array-backed CompleteSignature.
    foreach (var entry in blocksBySize)
    {
        result.Signatures[entry.Key] = new CompleteSignature { SignatureList = entry.Value.ToArray() };
    }

    return result;
}
/// <summary>
/// Rebuilds the given tree view from a signature and refreshes the size labels.
/// A tree view named "sigTreeView" is treated as the left (file 1) tree; any
/// other tree view is treated as the file 2 tree.
/// </summary>
/// <param name="sig">Signature to display.</param>
/// <param name="sigDict">Node-to-signatures map passed through to the tree population.</param>
/// <param name="sigTV">Tree view to clear and repopulate.</param>
private void PopulateSignatureTree(SizeBasedCompleteSignature sig, Dictionary<TreeNode, List<BlockSignature>> sigDict, TreeView sigTV)
{
    sigTV.Nodes.Clear();
    var showsFirstFile = sigTV.Name == "sigTreeView";

    // Reset the shared-bytes counter before repopulating; it is read back
    // below (presumably updated while the nodes are built — confirm in
    // PopulateRootNodes).
    bothFilesShared = 0;
    PopulateSignatureTreeByOffset(sigTV, sig, sigDict);

    sharedSize.Text = bothFilesShared.ToString("N0");
    newSize.Text = (file2Size - bothFilesShared).ToString("N0");

    if (showsFirstFile)
    {
        file1TotalSize.Text = file1Size.ToString("N0");
    }
    else
    {
        file2TotalSize.Text = file2Size.ToString("N0");
    }

    sigTV.Update();
}
/// <summary>
/// Computes the total size described by a signature: for every block size,
/// the size multiplied by the number of blocks recorded for it.
/// </summary>
/// <param name="sig">Signature to total up.</param>
/// <returns>The sum of (block size × block count) over all block sizes.</returns>
private long CalculateFileSize(SizeBasedCompleteSignature sig)
{
    long total = 0;

    foreach (var entry in sig.Signatures)
    {
        // Widen to long before multiplying so the product cannot overflow int.
        long blockSize = entry.Key;
        long blockCount = entry.Value.SignatureList.Count();
        total += blockSize * blockCount;
    }

    return total;
}
/// <summary>
/// Merges runs of consecutive small blocks into groups no larger than
/// <c>ConfigHelper.SignatureSize</c>, most fragmented groups first, until the
/// upload budget is used up.
/// </summary>
/// <param name="blobSig">Signature of the blob to defragment.</param>
/// <param name="maxUploadLimitMB">
/// Upload budget in megabytes. Zero (or a negative value) means no limit.
/// The budget is checked after each group is processed, so the group that
/// crosses the limit is still merged.
/// </param>
public void DefragBlob(SizeBasedCompleteSignature blobSig, long maxUploadLimitMB = 2)
{
    var allBlobSigs = blobSig.Signatures.Values.SelectMany(x => x.SignatureList).OrderBy(a => a.Offset).ToList();
    var targetSigSize = ConfigHelper.SignatureSize;

    // The limit is given in MB but block sizes are in bytes; convert once.
    const long BytesPerMegabyte = 1024L * 1024L;
    var hasUploadLimit = maxUploadLimitMB > 0;
    var uploadLimitBytes = maxUploadLimitMB * BytesPerMegabyte;

    // For every starting position, work out how many consecutive sigs fit
    // into one target-sized block.
    var defragNodeList = new List<DefragNode>();
    for (var i = 0; i < allBlobSigs.Count; i++)
    {
        uint groupSize = 0;
        var groupCount = 0;

        for (var j = i; j < allBlobSigs.Count; j++)
        {
            var candidate = allBlobSigs[j];

            // Stop before the group grows past the target size.
            if ((long)groupSize + candidate.Size > targetSigSize)
            {
                break;
            }

            groupSize += candidate.Size;
            groupCount++;
        }

        // groupCount is the number of sigs actually included, whether the
        // scan stopped on the size limit or ran off the end of the list.
        defragNodeList.Add(new DefragNode { Offset = allBlobSigs[i].Offset, Size = groupSize, SigPos = i, NoSigs = groupCount });
    }

    // Groups holding the most sigs are the most fragmented, so merge those first.
    // NOTE(review): a group is produced for every starting sig, so groups can
    // overlap; confirm DefragSigGroup copes with that.
    var sortedList = defragNodeList.OrderByDescending(n => n.NoSigs).ToList();

    long bytesToUpload = 0;
    foreach (var sig in sortedList)
    {
        DefragSigGroup(blobSig, sig);
        bytesToUpload += sig.Size;

        if (hasUploadLimit && bytesToUpload > uploadLimitBytes)
        {
            break;
        }
    }
}
/// <summary>
/// Serializes a signature and uploads it as the signature blob that belongs
/// to the given blob.
/// </summary>
/// <param name="blobName">Name of the blob the signature describes.</param>
/// <param name="containerName">Container holding the blob; the signature blob is written to the same container.</param>
/// <param name="sig">Signature to serialize and upload.</param>
public void UploadSignatureForBlob(string blobName, string containerName, SizeBasedCompleteSignature sig)
{
    var container = AzureHelper.GetCloudBlobClient().GetContainerReference(containerName);

    // The signature blob's name is derived from the container and blob names.
    var signatureBlob = container.GetBlockBlobReference(AzureHelper.SetSignatureName(containerName, blobName));

    using (Stream serialized = new MemoryStream())
    {
        SerializationHelper.WriteBinarySizedBasedSignature(sig, serialized);

        // Rewind so the upload starts from the first serialized byte.
        serialized.Seek(0, SeekOrigin.Begin);
        signatureBlob.UploadFromStream(serialized);
    }
}
/// <summary>
/// Writes a size-based signature to a stream in binary form.
/// Layout: an int count of block sizes; then, per block size, the int size,
/// an int entry count, and for each entry its Offset, Size, BlockNumber,
/// RollingSig.Sig1, RollingSig.Sig2 and the raw MD5 bytes, in that order.
/// </summary>
/// <param name="sig">Signature to serialize.</param>
/// <param name="s">Destination stream; it is left open and is not rewound.</param>
public static void WriteBinarySizedBasedSignature(SizeBasedCompleteSignature sig, Stream s)
{
    // The writer is deliberately not disposed: disposing it would close the
    // caller's stream.
    var output = new BinaryWriter(s);

    // Header: number of distinct block sizes (4 bytes).
    int sizeCount = sig.Signatures.Keys.Count;
    output.Write(sizeCount);

    foreach (var entry in sig.Signatures)
    {
        int blockSize = entry.Key;
        var blocks = entry.Value.SignatureList;
        int blockCount = blocks.Length;

        // Per-size header: the block size, then how many entries follow.
        output.Write(blockSize);
        output.Write(blockCount);

        foreach (var block in blocks)
        {
            // Field widths follow each field's declared type (presumably
            // 8-byte offset and 4-byte size/block number — confirm against
            // BlockSignature).
            output.Write(block.Offset);
            output.Write(block.Size);
            output.Write(block.BlockNumber);
            output.Write(block.RollingSig.Sig1);
            output.Write(block.RollingSig.Sig2);

            // MD5 hash, one byte at a time (expected to be 16 bytes).
            foreach (byte md5Byte in block.MD5Signature)
            {
                output.Write(md5Byte);
            }
        }
    }
}
/// <summary>
/// Reads a local file in blocks of <c>ConfigHelper.SignatureSize</c> bytes and
/// builds a signature from them, grouping block signatures by block length.
/// Only the final block of the file can be shorter than the configured size.
/// </summary>
/// <param name="localFilePath">Path of the file to sign. It is opened for reading only.</param>
/// <returns>A signature mapping block length to the signatures of blocks with that length.</returns>
public static SizeBasedCompleteSignature CreateSignatureForLocalFile(string localFilePath)
{
    var blockSize = ConfigHelper.SignatureSize;
    var buffer = new byte[blockSize];
    var sigDict = new Dictionary<int, List<BlockSignature>>();

    // Read access is all that is needed; the FileMode-only constructor would
    // request read/write access and fail on read-only files.
    using (var fs = new FileStream(localFilePath, FileMode.Open, FileAccess.Read))
    {
        long offset = 0;
        uint idCount = 0;

        while (true)
        {
            // Stream.Read may return fewer bytes than requested before the end
            // of the file, so keep reading until the block is full or the
            // file is exhausted.
            var bytesRead = 0;
            int chunk;
            while (bytesRead < blockSize && (chunk = fs.Read(buffer, bytesRead, blockSize - bytesRead)) > 0)
            {
                bytesRead += chunk;
            }

            if (bytesRead == 0)
            {
                break;
            }

            var blockSig = GenerateBlockSig(buffer, offset, bytesRead, idCount);

            List<BlockSignature> sigList;
            if (!sigDict.TryGetValue(bytesRead, out sigList))
            {
                sigList = new List<BlockSignature>();
                sigDict[bytesRead] = sigList;
            }

            sigList.Add(blockSig);
            offset += bytesRead;
            idCount++;
        }
    }

    var sizedBaseSignature = new SizeBasedCompleteSignature();
    sizedBaseSignature.Signatures = new Dictionary<int, CompleteSignature>();

    foreach (var entry in sigDict)
    {
        sizedBaseSignature.Signatures[entry.Key] = new CompleteSignature { SignatureList = entry.Value.ToArray() };
    }

    return sizedBaseSignature;
}
/// <summary>
/// Flattens every block signature in <paramref name="sig"/>, orders them by
/// offset, and hands the ordered list to <c>PopulateRootNodes</c>.
/// Does nothing when the signature has no signature dictionary.
/// </summary>
/// <param name="sigTV">Tree view to populate.</param>
/// <param name="sig">Signature providing the block signatures.</param>
/// <param name="sigDict">Node-to-signatures map passed through to <c>PopulateRootNodes</c>.</param>
private void PopulateSignatureTreeByOffset(TreeView sigTV, SizeBasedCompleteSignature sig, Dictionary<TreeNode, List<BlockSignature>> sigDict)
{
    if (sig.Signatures == null)
    {
        return;
    }

    // Block size is irrelevant here: gather all signatures and order them
    // purely by where they sit in the file.
    var blocksByOffset = sig.Signatures.Values
        .SelectMany(completeSig => completeSig.SignatureList)
        .OrderBy(blockSig => blockSig.Offset)
        .ToList();

    PopulateRootNodes(sigTV, blocksByOffset, sigDict);
}
/// <summary>
/// Shows a message box with the details of the block signature behind a
/// double-clicked tree node. Nodes without a parent (and a null node) are ignored.
/// </summary>
/// <param name="selectedNode">
/// The node that was double-clicked. The first whitespace-separated token of
/// its text is parsed as the block offset, and the first token of its
/// parent's text as the block size.
/// </param>
/// <param name="sig">Signature in which the block is looked up.</param>
private void ProcessDoubleClick(TreeNode selectedNode, SizeBasedCompleteSignature sig)
{
    if (selectedNode == null || selectedNode.Parent == null)
    {
        return;
    }

    var parentTokens = selectedNode.Parent.Text.Split();
    var blockSize = Convert.ToInt32(parentTokens[0]);
    var blockOffset = Convert.ToInt64(selectedNode.Text.Split()[0]);

    // First() throws when no block of that size sits at the offset.
    var block = sig.Signatures[blockSize].SignatureList.First(s => s.Offset == blockOffset);

    var md5Text = ByteArrayToString(block.MD5Signature);
    var rollingText = string.Format("{0}:{1}", block.RollingSig.Sig1, block.RollingSig.Sig2);
    var details = string.Format("Offset: {0}\nSize: {1}\nRollingSig: {2}\nMD5: {3}", block.Offset.ToString(), block.Size.ToString(), rollingText, md5Text);

    MessageBox.Show(details);
}
/// <summary>
/// Reads a <c>SizeBasedCompleteSignature</c> from a stream.
/// Format: the first 4 bytes hold the number of complete signatures. Each
/// one is then stored as a 4-byte key (block) size followed by the complete
/// signature itself, which is read by <c>ReadBinaryCompleteSignature</c>.
/// </summary>
/// <param name="s">Seekable stream holding the signature; it is always rewound to the start first.</param>
/// <returns>The signature read from the stream, keyed by block size.</returns>
public static SizeBasedCompleteSignature ReadSizeBasedBinarySignature(Stream s)
{
    var result = new SizeBasedCompleteSignature();
    result.Signatures = new Dictionary<int, CompleteSignature>();

    // Parsing always starts at the beginning, wherever the caller left the stream.
    s.Seek(0, SeekOrigin.Begin);
    var input = new BinaryReader(s);

    var remaining = input.ReadInt32();
    while (remaining > 0)
    {
        int blockSize = input.ReadInt32();
        result.Signatures[blockSize] = ReadBinaryCompleteSignature(s);
        remaining--;
    }

    return result;
}
/// <summary>
/// Loads a signature file, records its MD5 lookup and total size for the
/// matching side (file 1 when the tree view is named "sigTreeView", file 2
/// otherwise), and repopulates the tree view.
/// </summary>
/// <param name="filename">Path of the signature file. It is opened for reading only.</param>
/// <param name="sig">Receives the signature read from the file.</param>
/// <param name="sigDict">Node-to-signatures map passed through to the tree population.</param>
/// <param name="sigTV">Tree view that displays the loaded signature.</param>
private void LoadSigFile(string filename, ref SizeBasedCompleteSignature sig, Dictionary<TreeNode, List<BlockSignature>> sigDict, TreeView sigTV)
{
    // Read access is all that is needed; the FileMode-only constructor would
    // request read/write access and fail on read-only signature files.
    using (var fs = new FileStream(filename, FileMode.Open, FileAccess.Read))
    {
        sig = SerializationHelper.ReadSizeBasedBinarySignature(fs);

        // NOTE(review): the result of the duplicate-MD5 check is discarded —
        // confirm whether a failed check should be reported to the user.
        VerifySignature(sig);

        if (sigTV.Name == "sigTreeView")
        {
            sig1MD5Dict = GenerateMD5DictFromSig(sig);
            file1Size = CalculateFileSize(sig);
        }
        else
        {
            sig2MD5Dict = GenerateMD5DictFromSig(sig);
            file2Size = CalculateFileSize(sig);
        }
    }

    bothFilesShared = 0;
    PopulateSignatureTree(sig, sigDict, sigTV);
}
// Defrags a group of sigs by merging them together.
// blobSig: signature of the blob being defragmented.
// sig: the group of sigs to merge.
// NOTE(review): the body is currently empty, so calling this has no effect —
// TODO implement the merge.
private void DefragSigGroup(SizeBasedCompleteSignature blobSig, DefragNode sig) { }
/// <summary>
/// Works out which byte ranges of the cloud blob must be downloaded: one
/// range per cloud block whose MD5 is not among the reusable signatures.
/// </summary>
/// <param name="sigsToReuseList">Signatures of blocks that do not need to be downloaded.</param>
/// <param name="cloudBlobSig">Signature of the blob as stored in the cloud.</param>
/// <param name="containerName">Container of the blob (not needed for the calculation; kept for callers).</param>
/// <param name="blobName">Name of the blob (not needed for the calculation; kept for callers).</param>
/// <returns>Inclusive byte ranges to download, in ascending offset order.</returns>
private List<RemainingBytes> GenerateByteRangesOfBlobToDownload(List<BlockSignature> sigsToReuseList, SizeBasedCompleteSignature cloudBlobSig, string containerName, string blobName)
{
    var remainingBytesList = new List<RemainingBytes>();
    var allBlobSigs = cloudBlobSig.Signatures.Values.SelectMany(x => x.SignatureList).OrderBy(a => a.Offset).ToList();

    // Index the reusable MD5s once so each cloud block is a single hash
    // lookup rather than a scan over the whole reuse list.
    var reusableMd5s = new HashSet<string>(sigsToReuseList.Select(s => Convert.ToBase64String(s.MD5Signature)));

    long startOffsetToCopy = 0;

    // Walk the cloud blocks in offset order. A block with a reusable match is
    // skipped; otherwise its range is recorded for download.
    foreach (var sig in allBlobSigs)
    {
        var offsetAfterBlock = sig.Offset + sig.Size;

        if (!reusableMd5s.Contains(Convert.ToBase64String(sig.MD5Signature)))
        {
            // No match: copy everything from startOffsetToCopy up to the end
            // of this block (EndOffset is inclusive).
            remainingBytesList.Add(new RemainingBytes { BeginOffset = startOffsetToCopy, EndOffset = offsetAfterBlock - 1 });
        }

        // Either way the next range starts just after the current block.
        startOffsetToCopy = offsetAfterBlock;
    }

    return remainingBytesList;
}
/// <summary>
/// Regenerates the blob locally. Each block of the blob is either copied
/// from the existing local file (when a reusable block matches) or
/// downloaded from Azure. The result is written to "&lt;localFilePath&gt;.new",
/// which then replaces the original file.
/// </summary>
/// <param name="containerName">Container of the blob to download from.</param>
/// <param name="blobName">Name of the blob to download from.</param>
/// <param name="byteRangesToDownload">Inclusive byte ranges that have to be downloaded.</param>
/// <param name="localFilePath">Existing local file; read for reusable blocks and replaced at the end.</param>
/// <param name="reusableBlockSignatures">Signatures of local blocks that can be reused.</param>
/// <param name="blobSig">Signature of the blob being regenerated.</param>
/// <param name="parallelFactor">Passed through to <c>DownloadBytes</c>.</param>
/// <exception cref="EndOfStreamException">
/// The local file ends before a reusable block could be read completely.
/// </exception>
private void RegenerateBlob(string containerName, string blobName, List<RemainingBytes> byteRangesToDownload, string localFilePath, List<BlockSignature> reusableBlockSignatures, SizeBasedCompleteSignature blobSig, int parallelFactor = 2)
{
    // Removing size from the equation: just every block in offset order.
    var allBlobSigs = blobSig.Signatures.Values.SelectMany(x => x.SignatureList).OrderBy(a => a.Offset).ToList();

    // LUT to see if a block is to be reused or not.
    var reusableBlockDict = CommonOps.GenerateBlockDict(reusableBlockSignatures.ToArray());

    var newFilePath = localFilePath + ".new";
    var offset = 0L;

    // The existing file is only read here, so ask for read access only.
    using (var localStream = new FileStream(localFilePath, FileMode.Open, FileAccess.Read))
    using (var newStream = new FileStream(newFilePath, FileMode.Create))
    {
        // Go through all sigs in offset order and decide: reuse or download.
        foreach (var sig in allBlobSigs)
        {
            var haveMatch = false;

            if (reusableBlockDict.ContainsKey(sig.RollingSig))
            {
                // The rolling sig matches; confirm with the MD5 before reusing.
                var matchingLocalSigs = reusableBlockDict[sig.RollingSig]
                    .Where(s => s.MD5Signature.SequenceEqual(sig.MD5Signature))
                    .ToList();

                if (matchingLocalSigs.Any())
                {
                    var matchingLocalSig = matchingLocalSigs[0];
                    var blockLength = (int)matchingLocalSig.Size;
                    var buffer = new byte[blockLength];

                    localStream.Seek(matchingLocalSig.Offset, SeekOrigin.Begin);

                    // Stream.Read may return fewer bytes than requested, so
                    // loop until the whole block is in the buffer.
                    var filled = 0;
                    while (filled < blockLength)
                    {
                        var read = localStream.Read(buffer, filled, blockLength - filled);
                        if (read == 0)
                        {
                            throw new EndOfStreamException(string.Format("Local file ended while reading {0} bytes at offset {1}.", blockLength, matchingLocalSig.Offset));
                        }

                        filled += read;
                    }

                    newStream.Seek(sig.Offset, SeekOrigin.Begin);
                    newStream.Write(buffer, 0, blockLength);

                    haveMatch = true;
                    offset += matchingLocalSig.Size;
                }
            }

            if (!haveMatch)
            {
                // Check if we have a byte range starting at the current offset.
                // NOTE(review): when no range is found, nothing is written and
                // offset is not advanced, so later lookups will not line up —
                // confirm whether that case can occur.
                var byteRange = (from b in byteRangesToDownload where b.BeginOffset == offset select b).FirstOrDefault();
                if (byteRange != null)
                {
                    var rangeLength = byteRange.EndOffset - byteRange.BeginOffset + 1;

                    var blobBytes = DownloadBytes(containerName, blobName, byteRange.BeginOffset, byteRange.EndOffset, parallelFactor);
                    newStream.Seek(sig.Offset, SeekOrigin.Begin);
                    newStream.Write(blobBytes, 0, (int)rangeLength);
                    offset += rangeLength;
                }
            }
        }
    }

    // Both streams are closed now; swap the .new file in for the original.
    File.Replace(newFilePath, localFilePath, null);
}
/// <summary>
/// Searches a local file for blocks that match an existing signature.
/// Block sizes are searched from largest to smallest; each pass narrows the
/// list of byte ranges that are still unmatched.
/// </summary>
/// <param name="localFilePath">Path of the local file to search.</param>
/// <param name="sig">Existing signature whose blocks are looked for in the file.</param>
/// <returns>
/// The signatures that can be reused and the byte ranges left unmatched,
/// which therefore have to be uploaded.
/// </returns>
public static SignatureSearchResult SearchLocalFileForSignatures(string localFilePath, SizeBasedCompleteSignature sig)
{
    var result = new SignatureSearchResult();

    // Only the length is needed, so ask the file system; do not open
    // the file (File.Open would demand exclusive read/write access).
    var fileLength = new FileInfo(localFilePath).Length;

    // Signatures we can reuse.
    var signaturesToReuse = new List<BlockSignature>();

    // Block sizes from the existing sig. It is important to search from
    // largest to smallest.
    var signatureSizes = sig.Signatures.Keys.OrderByDescending(size => size).ToList();

    // Byte ranges not yet matched to existing blocks: initially the whole file.
    var remainingByteList = new List<RemainingBytes>
    {
        new RemainingBytes { BeginOffset = 0, EndOffset = fileLength - 1 }
    };

    using (var mmf = MemoryMappedFile.CreateFromFile(localFilePath, FileMode.Open))
    using (var accessor = mmf.CreateViewAccessor())
    {
        // Open question carried over from the original author: should very
        // small sigs (e.g. under 100 bytes) be skipped, to avoid searching
        // for tiny blocks everywhere?
        foreach (var sigSize in signatureSizes)
        {
            var sigs = sig.Signatures[sigSize];
            remainingByteList = SearchLocalFileForSignaturesBasedOnSize(sigs, accessor, remainingByteList, sigSize, fileLength, signaturesToReuse);
        }
    }

    result.ByteRangesToUpload = remainingByteList;
    result.SignaturesToReuse = signaturesToReuse;
    return result;
}