private void BuildDupFilesList(List<DupFiles> dupsFiles)
{
    // Reset the results panel and show how many duplicate groups were found.
    verticalListView_results.Controls.Clear();
    label_dupResTitle.Text = "Found " + dupsFiles.Count + " duplications:";

    foreach (DupFiles df in dupsFiles)
    {
        DupResult dupResult = new DupResult();
        dupResult.NumOfDups = df.DuplicationsFiles.Count;

        // Clicking the handle button opens a dialog listing this group's matching files.
        dupResult.AddOnHandleBtClickListener((s, e) =>
        {
            DupMatchsForm dmf = new DupMatchsForm(df);
            dmf.ShowDialog();
        });

        verticalListView_results.AddControl(dupResult);
    }
}
private static void Perf(IDupDetector dupDetector, int workers, string folder, int times)
{
    var files = Directory.GetFiles(folder, "*.*", SearchOption.AllDirectories);
    var timer = new Stopwatch();
    DupResult result = default;

    // Guard against a non-positive iteration count.
    if (times <= 0)
    {
        times = 10;
    }

    // Run the detector repeatedly and report the average elapsed time.
    timer.Start();
    for (var i = 0; i < times; i++)
    {
        result = dupDetector.Find(files, workers);
    }
    timer.Stop();

    Log(string.Format("Dup method: {0}, workers: {1}, groups: {2}, times: {3}, avg elapse: {4}",
        dupDetector, workers, result.Duplicates.Count, times,
        TimeSpan.FromMilliseconds(timer.ElapsedMilliseconds / times)), true);
}
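For context, a minimal sketch of how Perf might be driven from a console entry point in the same class. The DupDetector implementation name, folder path, and worker counts are assumptions for illustration; Perf averages each configuration over the given number of iterations.

private static void Main(string[] args)
{
    var folder = args.Length > 0 ? args[0] : @"C:\temp\photos";

    // Benchmark the same folder with increasing worker counts; each run is
    // averaged over 5 iterations by Perf. DupDetector is a hypothetical
    // IDupDetector implementation.
    foreach (var workers in new[] { 1, 2, 4, 8 })
    {
        Perf(new DupDetector(), workers, folder, 5);
    }
}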
public DupResult Find(IEnumerable<string> files, int workers, int quickHashSize, int bufferSize)
{
    var result = new DupResult
    {
        Duplicates = new List<Duplicate>(),
        FailedToProcessFiles = new List<string>(),
        TotalFiles = files.LongCount()
    };
    var totalComparedFiles = 0L;
    var totalFileBytes = 0L;
    var totalReadBytes = 0L;

    _workers = workers;
    if (_workers <= 0)
    {
        _workers = 5;
    }

    if (bufferSize <= 3)
    {
        bufferSize = DefaultBufferSize;
    }

    if (quickHashSize <= 0)
    {
        quickHashSize = DefaulfQuickHashSize;
    }

    // Stage 1: group files with the same size; files that cannot be read are recorded as failed.
    var sameSizeGroups = files.Select(f =>
    {
        try
        {
            return GetDupFileItem(f);
        }
        catch (Exception)
        {
            result.FailedToProcessFiles.Add(f);
            return null;
        }
    }).Where(f => f != null).GroupBy(f => f.Size).Where(g => g.Count() > 1);

    var mappedSameSizeGroupList = new ConcurrentBag<IGrouping<string, DupItem>>();

    // Stage 2: within each same-size group, compare a quick hash of a few sampled bytes.
    Parallel.ForEach(MapFileSizeGroups(sameSizeGroups), mappedSameSizeGroups =>
    {
        foreach (var group in mappedSameSizeGroups)
        {
            foreach (var file in group)
            {
                Interlocked.Increment(ref totalComparedFiles);
                try
                {
                    // Fast random-bytes checking.
                    QuickHashFile(file, quickHashSize, ref totalFileBytes, ref totalReadBytes);
                }
                catch (Exception)
                {
                    file.Status = CompareStatus.Failed;
                    result.FailedToProcessFiles.Add(file.FileName);
                }
            }

            // Groups with the same quick hash value.
            var sameQuickHashGroups = group.Where(f => f.Status != CompareStatus.Failed)
                .GroupBy(f => f.QuickHash)
                .Where(g => g.Count() > 1);
            foreach (var sameQuickHashGroup in sameQuickHashGroups)
            {
                mappedSameSizeGroupList.Add(sameQuickHashGroup);
            }
        }
    });

    // Stage 3: progressively hash the remaining candidates and group by full hash.
    Parallel.ForEach(MapFileHashGroups(mappedSameSizeGroupList), mappedSameHashGroups =>
    {
        foreach (var quickHashGroup in mappedSameHashGroups)
        {
            ProgressiveHash(quickHashGroup, bufferSize, ref totalReadBytes);
            result.FailedToProcessFiles.AddRange(
                quickHashGroup.Where(f => f.Status == CompareStatus.Failed).Select(f => f.FileName));

            // Phew, finally... group by identical full file hash.
            var sameFullHashGroups = quickHashGroup.Where(g => g.Status != CompareStatus.Failed)
                .GroupBy(g => g.FullHash)
                .Where(g => g.Count() > 1);
            result.Duplicates.AddRange(sameFullHashGroups.Select(fullHashGroup => new Duplicate
            {
                Items = fullHashGroup.Select(f => new FileItem
                {
                    FileName = f.FileName,
                    ModifiedTime = f.ModifiedTime,
                    Size = f.Size
                })
            }));
        }
    });

    result.TotalComparedFiles = totalComparedFiles;
    result.TotalBytesInComparedFiles = totalFileBytes;
    result.TotalReadBytes = totalReadBytes;
    return result;
}
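A hedged usage sketch for this overload. The containing class is assumed here to be called DupDetector (hypothetical name), and the folder, worker count, and buffer sizes are illustrative; the counters printed at the end come straight from the fields populated above.

var files = Directory.GetFiles(@"D:\data", "*.*", SearchOption.AllDirectories);
var detector = new DupDetector(); // hypothetical class exposing this Find overload

// 4 parallel partitions, 4 sampled bytes for the quick hash, 64 KB read buffer.
var result = detector.Find(files, workers: 4, quickHashSize: 4, bufferSize: 64 * 1024);

Console.WriteLine("Compared {0}/{1} files, read {2}/{3} bytes, {4} duplicate groups, {5} failures.",
    result.TotalComparedFiles, result.TotalFiles,
    result.TotalReadBytes, result.TotalBytesInComparedFiles,
    result.Duplicates.Count, result.FailedToProcessFiles.Count);

foreach (var duplicate in result.Duplicates)
{
    foreach (var item in duplicate.Items)
    {
        Console.WriteLine("{0}\t{1}\t{2}", item.FileName, item.Size, item.ModifiedTime);
    }
    Console.WriteLine();
}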
public DupResult Find(IEnumerable<string> files, int workers, int quickHashSize = 3, int bufferSize = 0)
{
    var result = new DupResult { Duplicates = new List<Duplicate>(), FailedToProcessFiles = new List<string>() };

    _workers = workers;
    if (_workers <= 0)
    {
        _workers = 5;
    }

    if (bufferSize <= 3)
    {
        bufferSize = DefaultBufferSize;
    }

    // Groups with the same file size.
    var sameSizeGroups = files.Select(f =>
    {
        var info = new FileInfo(f);
        return new DupItem { FileName = f, ModifiedTime = info.LastWriteTime, Size = info.Length };
    }).GroupBy(f => f.Size).Where(g => g.Count() > 1);

    var mappedSameSizeGroupList = new ConcurrentBag<IGrouping<string, DupItem>>();

    Parallel.ForEach(MapFileSizeGroups(sameSizeGroups), mappedSameSizeGroups =>
    {
        foreach (var group in mappedSameSizeGroups)
        {
            foreach (var file in group)
            {
                if (file.Size > 0)
                {
                    // Fast random-byte checking.
                    try
                    {
                        using (var stream = File.Open(file.FileName, FileMode.Open, FileAccess.Read, FileShare.Read))
                        {
                            var length = stream.Length;
                            file.Tags = new byte[3];

                            // First byte.
                            stream.Seek(0, SeekOrigin.Begin);
                            file.Tags[0] = (byte)stream.ReadByte();

                            // Middle byte; we need it especially for XML-like files.
                            if (length > 1)
                            {
                                stream.Seek(stream.Length / 2, SeekOrigin.Begin);
                                file.Tags[1] = (byte)stream.ReadByte();
                            }

                            // Last byte (seek one byte back from the end so ReadByte does not return -1).
                            if (length > 2)
                            {
                                stream.Seek(-1, SeekOrigin.End);
                                file.Tags[2] = (byte)stream.ReadByte();
                            }

                            file.QuickHash = HashTool.GetHashText(file.Tags);
                        }
                    }
                    catch (Exception)
                    {
                        file.Status = CompareStatus.Failed;
                        result.FailedToProcessFiles.Add(file.FileName);
                    }
                }
            }

            // Groups with the same quick hash value.
            var sameQuickHashGroups = group.Where(f => f.Status != CompareStatus.Failed)
                .GroupBy(f => f.QuickHash)
                .Where(g => g.Count() > 1);
            foreach (var sameQuickHashGroup in sameQuickHashGroups)
            {
                mappedSameSizeGroupList.Add(sameQuickHashGroup);
            }
        }
    });

    Parallel.ForEach(MapFileHashGroups(mappedSameSizeGroupList), mappedSameHashGroups =>
    {
        foreach (var quickHashGroup in mappedSameHashGroups)
        {
            foreach (var groupFile in quickHashGroup)
            {
                try
                {
                    groupFile.FullHash = HashTool.HashFile(groupFile.FileName, bufferSize);
                }
                catch (Exception)
                {
                    result.FailedToProcessFiles.Add(groupFile.FileName);
                }
            }

            // Phew, finally... group by identical full file hash.
            var sameFullHashGroups = quickHashGroup.GroupBy(g => g.FullHash).Where(g => g.Count() > 1);
            result.Duplicates.AddRange(sameFullHashGroups.Select(fullHashGroup => new Duplicate
            {
                Items = fullHashGroup.Select(f => new FileItem
                {
                    FileName = f.FileName,
                    ModifiedTime = f.ModifiedTime,
                    Size = f.Size
                })
            }));
        }
    });

    return result;
}
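This earlier overload can be called with its defaults, in which case the quick hash samples three bytes and the full hash falls back to DefaultBufferSize. A brief sketch under the same assumptions as above (hypothetical DupDetector class, illustrative folder):

var files = Directory.EnumerateFiles(@"D:\data", "*.*", SearchOption.AllDirectories);
var result = new DupDetector().Find(files, workers: 4); // defaults: 3-byte quick hash, DefaultBufferSize

foreach (var duplicate in result.Duplicates)
{
    Console.WriteLine("Duplicate group ({0} files):", duplicate.Items.Count());
    foreach (var item in duplicate.Items)
    {
        Console.WriteLine("  {0}", item.FileName);
    }
}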