/// <summary>
/// Step 2: collect the CRC information of every file in <paramref name="fileList"/>.
/// </summary>
/// <param name="fileList">Files to inspect; each one is handed to Util.GetArchiveInfo.</param>
/// <param name="option">Search options forwarded unchanged to Util.GetArchiveInfo.</param>
/// <returns>One DuplicateArchiveInfo for each file that was read without an exception.</returns>
private List<DuplicateArchiveInfo> CalculateCRC(List<FileInfo> fileList, DuplicateSearchOption option)
{
    var archives = new List<DuplicateArchiveInfo>();
    int processed = 0;

    foreach (FileInfo file in fileList)
    {
        // Wait on the pause event first, then poll the shutdown event without blocking.
        _pauseEvent.WaitOne(Timeout.Infinite);
        bool shutdownRequested = _shutdownEvent.WaitOne(0);
        if (shutdownRequested)
        {
            break;
        }

        // Progress notification carries only the counters, no message text.
        NotifyCaller("", OperationStatus.CALCULATING_CRC, curr: processed, total: fileList.Count);

        try
        {
            DuplicateArchiveInfo info = Util.GetArchiveInfo(file.FullName, option);
            info.FileSize = file.Length;
            info.CreationTime = file.CreationTime;
            archives.Add(info);
        }
        catch (Exception ex)
        {
            // A failing file is reported to the caller and skipped; the loop continues.
            NotifyCaller(string.Concat(ex.Message, " (", file.FullName, ")"), OperationStatus.ERROR);
        }

        processed++;
    }

    int archiveCount = archives.Count;
    NotifyCaller("Complete calculating CRC, total: " + archiveCount, OperationStatus.CALCULATING_CRC, total: archiveCount);
    return archives;
}
/// <summary>
/// Check whether <paramref name="duplicate"/> should be treated as a duplicate of
/// <paramref name="original"/>.
/// </summary>
/// <remarks>
/// Two outcomes count as a match:
/// equal item counts with equal CRC strings (MatchType.EQUALCOUNT, 100%), or
/// enough of the duplicate's items being found in the original (MatchType.SUBSET).
/// The subset check walks both item lists in lock-step, so it relies on both lists
/// being ordered consistently by CRC (presumably done by SortItems - confirm).
/// </remarks>
/// <param name="original">Candidate original; the whole comparison runs under a lock on this object.</param>
/// <param name="duplicate">Candidate duplicate; its Percentage, MatchType and NoMatches are updated.</param>
/// <param name="option">Supplies OnlyPerfectMatch, IgnoreLimit and Limit (percentage).</param>
/// <returns>true when <paramref name="duplicate"/> matched; otherwise false.</returns>
private bool Compare(ref DuplicateArchiveInfo original, ref DuplicateArchiveInfo duplicate, DuplicateSearchOption option)
{
    lock (original)
    {
        // If the match type already changed from ORIGINAL, skip it:
        // most likely it was already validated by another task.
        if (original.MatchType != MatchType.ORIGINAL)
        {
            return false;
        }

        original.MatchType = MatchType.ORIGINAL;
        original.Percentage = 0.0;
        if (original.NoMatches != null)
        {
            original.NoMatches.Clear();
        }

        // If the item count is equal, try the cheap check on the CRC strings first.
        if (original.Items.Count == duplicate.Items.Count)
        {
            if (original.ToCRCString() == duplicate.ToCRCString())
            {
                duplicate.Percentage = 100.0;
                duplicate.MatchType = MatchType.EQUALCOUNT;
                return true;
            }
            else if (option.OnlyPerfectMatch)
            {
                return false;
            }
        }

        // limitCount is the maximum number of duplicate entries allowed to have no match.
        // Archives with fewer than 'IgnoreLimit' files must match completely.
        int limitCount;
        if (option.IgnoreLimit > duplicate.Items.Count)
        {
            limitCount = 0;
        }
        else
        {
            limitCount = duplicate.Items.Count - (duplicate.Items.Count * option.Limit / 100);
        }

        int skippedCount = 0;
        int i = 0;
        int j = 0;
        while (i < original.Items.Count && j < duplicate.Items.Count && skippedCount <= limitCount)
        {
            // Compare starting from the biggest CRC.
            int result = string.Compare(original.Items[i].Crc, duplicate.Items[j].Crc, true, System.Globalization.CultureInfo.InvariantCulture);
            if (result == 0)
            {
                ++i;
                ++j;
            }
            else if (result > 0)
            {
                // Original entry skipped.
                ++i;
            }
            else
            {
                // Duplicate entry skipped: it has no match in the original.
                ++skippedCount;
                if (duplicate.NoMatches == null)
                {
                    duplicate.NoMatches = new List<ArchiveFileInfoSmall>();
                }
                duplicate.NoMatches.Add(duplicate.Items[j]);
                ++j;
            }
        }

        // Whatever is left in the duplicate was never matched.
        if (j < duplicate.Items.Count)
        {
            if (duplicate.NoMatches == null)
            {
                duplicate.NoMatches = new List<ArchiveFileInfoSmall>();
            }
            duplicate.NoMatches.AddRange(duplicate.Items.GetRange(j, duplicate.Items.Count - j));
            skippedCount = duplicate.NoMatches.Count;
        }

        // With an empty duplicate this is 0.0 / 0 = NaN, which fails the >= test below.
        double percent = (double)(duplicate.Items.Count - skippedCount) / duplicate.Items.Count * 100;

        // Use <= so the acceptance test agrees with the loop condition above.
        // With '<' a limitCount of 0 ("all must match") could never succeed, and a
        // result exactly on the percentage limit was rejected.
        if (percent >= option.Limit && skippedCount <= limitCount)
        {
            duplicate.Percentage = percent;
            duplicate.MatchType = MatchType.SUBSET;
            return true;
        }

        // No match: drop the unmatched entries collected during this comparison.
        if (duplicate.NoMatches != null)
        {
            duplicate.NoMatches.Clear();
        }
        return false;
    }
}
/// <summary>
/// Step 3: Build duplicate list.
/// </summary>
/// <remarks>
/// <paramref name="list"/> is sorted in place and consumed: the first element is
/// repeatedly taken as the candidate original and compared against the remaining ones.
/// In the serial path matched entries are removed from the list; in the parallel path
/// they are only flagged with IsRemoved.
/// </remarks>
/// <param name="list">Archive infos to group; modified by this method.</param>
/// <param name="option">Search options; TaskLimit &gt; 1 selects the parallel path.</param>
/// <returns>The groups that contain at least one duplicate, each sorted with DuplicateArchiveInfoPercentageComparer.</returns>
private List<DuplicateArchiveInfoList> BuildDuplicateList(List<DuplicateArchiveInfo> list, DuplicateSearchOption option)
{
    NotifyCaller("Start building duplicate list.", OperationStatus.BUILDING_DUPLICATE_LIST);

    List<DuplicateArchiveInfoList> dupList = new List<DuplicateArchiveInfoList>();
    list.Sort(new DuplicateArchiveInfoItemCountComparer());

    int totalCount = list.Count;
    int i = 0;
    while (list.Count > 0)
    {
        _pauseEvent.WaitOne(Timeout.Infinite);
        if (_shutdownEvent.WaitOne(0))
        {
            NotifyCaller("Stopping...", OperationStatus.BUILDING_DUPLICATE_LIST);
            break;
        }
        ++i;

        DuplicateArchiveInfoList dup = new DuplicateArchiveInfoList();
        DuplicateArchiveInfo original = list[0];
        list.RemoveAt(0);
        dup.Original = original;

        NotifyCaller("", OperationStatus.BUILDING_DUPLICATE_LIST, curr: i, total: totalCount);

        if (option.TaskLimit > 1)
        {
            // Parallel method.
            var taskScheduler = new Nandaka.Common.LimitedConcurrencyLevelTaskScheduler(option.TaskLimit, 16);
            var pOption = new ParallelOptions() { TaskScheduler = taskScheduler };
            Parallel.For(0, list.Count, pOption, (innerIdx) =>
            {
                DuplicateArchiveInfo curr = list[innerIdx];
                if (curr.IsRemoved)
                {
                    return;
                }

                if (Compare(ref original, ref curr, option))
                {
                    // List<T> is not safe for concurrent writers, so both the lazy
                    // creation of dup.Duplicates and the Add must happen under the lock.
                    lock (list)
                    {
                        // Flag instead of removing: indices must stay stable during Parallel.For.
                        curr.IsRemoved = true;
                        if (dup.Duplicates == null)
                        {
                            dup.Duplicates = new List<DuplicateArchiveInfo>();
                        }
                        dup.Duplicates.Add(curr);
                    }
                }
            });
        }
        else
        {
            // Serial method: check for other possible dups.
            int index = 0;
            while (list.Count > index)
            {
                DuplicateArchiveInfo curr = list[index];
                if (Compare(ref original, ref curr, option))
                {
                    if (dup.Duplicates == null)
                    {
                        dup.Duplicates = new List<DuplicateArchiveInfo>();
                    }
                    dup.Duplicates.Add(curr);

                    // Remove from the source list by position; the next candidate
                    // slides into 'index', so the index is not advanced.
                    list.RemoveAt(index);
                    --totalCount;
                }
                else
                {
                    ++index;
                }
            }
        }

        if (dup.Duplicates != null && dup.Duplicates.Count > 0)
        {
            dupList.Add(dup);
        }
    }

    foreach (DuplicateArchiveInfoList dup in dupList)
    {
        if (dup.Duplicates != null)
        {
            dup.Duplicates.Sort(new DuplicateArchiveInfoPercentageComparer());
        }
    }

    NotifyCaller("Building Duplicate List Complete.", OperationStatus.BUILDING_DUPLICATE_LIST);
    return dupList;
}
/// <summary>
/// Build DuplicateArchiveInfo containing the files' crc, sorted.
/// </summary>
/// <param name="filename">Path of the archive to open with SevenZipExtractor.</param>
/// <param name="option">
/// Supplies the 7-Zip library path, the blacklist pattern and its case sensitivity,
/// and the small-file filter settings.
/// </param>
/// <returns>
/// The archive description: entries that pass the filters are in Items, filtered
/// entries are in Skipped with a Remark naming the reason.
/// </returns>
public static DuplicateArchiveInfo GetArchiveInfo(string filename, DuplicateSearchOption option)
{
    // Build the blacklist regex only when a pattern is configured.
    // new Regex(null, ...) throws ArgumentNullException, which used to make every
    // archive fail when no blacklist pattern was set.
    Regex re = null;
    if (!String.IsNullOrWhiteSpace(option.BlacklistPattern))
    {
        re = new Regex(option.BlacklistPattern, option.BlacklistCaseInsensitive ? RegexOptions.IgnoreCase : RegexOptions.None);
    }

    DuplicateArchiveInfo info = new DuplicateArchiveInfo();
    SevenZipExtractor.SetLibraryPath(option.SevenZipPath);
    using (SevenZipExtractor extractor = new SevenZipExtractor(filename))
    {
        info.Filename = filename;
        info.Items = new List<ArchiveFileInfoSmall>();
        info.RealSize = extractor.UnpackedSize;
        info.ArchivedSize = extractor.PackedSize;

        ulong countedSize = 0;
        foreach (ArchiveFileInfo af in extractor.ArchiveFileData)
        {
            // Directories are only counted; they contribute no item and no size.
            if (af.IsDirectory)
            {
                info.DirectoryCount++;
                continue;
            }

            ArchiveFileInfoSmall item = new ArchiveFileInfoSmall()
            {
                Crc = ConvertToHexString(af.Crc),
                Filename = af.FileName,
                Size = af.Size
            };

            if (re != null && re.IsMatch(af.FileName))
            {
                if (info.Skipped == null)
                {
                    info.Skipped = new List<ArchiveFileInfoSmall>();
                }
                item.Remark = "Blacklisted";
                info.Skipped.Add(item);
            }
            else if (option.IgnoreSmallFile && item.Size < option.SmallFileSizeLimit)
            {
                if (info.Skipped == null)
                {
                    info.Skipped = new List<ArchiveFileInfoSmall>();
                }
                item.Remark = "SmallFileSizeLimit";
                info.Skipped.Add(item);
            }
            else
            {
                item.Remark = "";
                info.Items.Add(item);
            }

            // Skipped entries still count towards the total size.
            countedSize += af.Size;
        }

        // When the unpacked size was reported as -1, fall back to the sum of the entry sizes.
        if (info.RealSize == -1)
        {
            info.RealSize = Convert.ToInt64(countedSize);
        }

        info.SortItems();
    }
    return info;
}