Example #1
0
        /// <summary>
        /// Step 2: build the archive info (CRC data included) for every file in the list.
        /// </summary>
        /// <param name="fileList">Archive files to inspect, processed in list order.</param>
        /// <param name="option">Search options forwarded to <c>Util.GetArchiveInfo</c>.</param>
        /// <returns>
        /// One <c>DuplicateArchiveInfo</c> per file that could be read; files that raise an
        /// exception are reported through <c>NotifyCaller</c> and left out of the result.
        /// </returns>
        private List<DuplicateArchiveInfo> CalculateCRC(List<FileInfo> fileList, DuplicateSearchOption option)
        {
            var results = new List<DuplicateArchiveInfo>();

            for (int position = 0; position < fileList.Count; position++)
            {
                // Wait on the pause event first, then stop early once the shutdown event is signaled.
                _pauseEvent.WaitOne(Timeout.Infinite);
                if (_shutdownEvent.WaitOne(0))
                {
                    break;
                }

                FileInfo file = fileList[position];

                // Progress-only notification: the message text is intentionally empty.
                NotifyCaller("", OperationStatus.CALCULATING_CRC, curr: position, total: fileList.Count);

                try
                {
                    DuplicateArchiveInfo archive = Util.GetArchiveInfo(file.FullName, option);
                    archive.FileSize = file.Length;
                    archive.CreationTime = file.CreationTime;
                    results.Add(archive);
                }
                catch (Exception ex)
                {
                    // Report the failing file and carry on with the remaining ones.
                    NotifyCaller(ex.Message + " (" + file.FullName + ")", OperationStatus.ERROR);
                }
            }

            NotifyCaller("Complete calculating CRC, total: " + results.Count, OperationStatus.CALCULATING_CRC, total: results.Count);

            return results;
        }
Example #2
0
        /// <summary>
        /// Check whether <paramref name="duplicate"/> is a duplicate of <paramref name="original"/>
        /// by comparing the CRC values of their items.
        /// </summary>
        /// <param name="original">
        /// Reference archive. It is only compared while its <c>MatchType</c> is
        /// <c>MatchType.ORIGINAL</c>; its <c>Percentage</c> is reset to 0 and its
        /// <c>NoMatches</c> list (if any) is cleared. The instance is used as the lock object.
        /// </param>
        /// <param name="duplicate">
        /// Candidate archive. On a match its <c>Percentage</c> and <c>MatchType</c> are set and
        /// <c>NoMatches</c> holds its items that were not found in <paramref name="original"/>;
        /// on a failed partial comparison <c>NoMatches</c> is cleared again.
        /// </param>
        /// <param name="option">
        /// Supplies <c>OnlyPerfectMatch</c>, <c>IgnoreLimit</c> and <c>Limit</c> (a percentage).
        /// </param>
        /// <returns>
        /// <c>true</c> when the CRC strings are identical (<c>MatchType.EQUALCOUNT</c>, 100%) or
        /// when enough of the candidate's items are found in the original
        /// (<c>MatchType.SUBSET</c>); otherwise <c>false</c>.
        /// </returns>
        private bool Compare(ref DuplicateArchiveInfo original, ref DuplicateArchiveInfo duplicate, DuplicateSearchOption option)
        {
            lock (original)
            {
                // If the match type already changed from ORIGINAL, skip it:
                // most likely it was already validated by another task.
                if (original.MatchType != MatchType.ORIGINAL)
                {
                    return false;
                }

                original.MatchType  = MatchType.ORIGINAL;
                original.Percentage = 0.0;
                if (original.NoMatches != null)
                {
                    original.NoMatches.Clear();
                }

                // Same item count: the cheap check is a straight comparison of the CRC strings.
                if (original.Items.Count == duplicate.Items.Count)
                {
                    if (original.ToCRCString() == duplicate.ToCRCString())
                    {
                        duplicate.Percentage = 100.0;
                        duplicate.MatchType  = MatchType.EQUALCOUNT;
                        return true;
                    }
                    else if (option.OnlyPerfectMatch)
                    {
                        return false;
                    }
                }

                // Maximum number of candidate items that may be missing from the original.
                int limitCount;

                // If the candidate has fewer than 'IgnoreLimit' items, then all must match.
                if (option.IgnoreLimit > duplicate.Items.Count)
                {
                    limitCount = 0;
                }
                else
                {
                    limitCount = duplicate.Items.Count - (duplicate.Items.Count * option.Limit / 100);
                }

                // Merge-style walk over both item lists.
                // NOTE(review): this relies on both lists being ordered by Crc from biggest to
                // smallest (presumably what SortItems() produces) - confirm.
                int skippedCount = 0;
                int i            = 0;
                int j            = 0;
                while (i < original.Items.Count && j < duplicate.Items.Count && skippedCount <= limitCount)
                {
                    int result = string.Compare(original.Items[i].Crc, duplicate.Items[j].Crc, true, System.Globalization.CultureInfo.InvariantCulture);
                    if (result == 0)
                    {
                        // Same CRC on both sides: advance both.
                        ++i; ++j;
                    }
                    else if (result > 0)
                    {
                        // Original item has no counterpart in the candidate; that is allowed.
                        ++i;
                    }
                    else
                    {
                        // Candidate item has no match in the original.
                        ++skippedCount;
                        if (duplicate.NoMatches == null)
                        {
                            duplicate.NoMatches = new List<ArchiveFileInfoSmall>();
                        }
                        duplicate.NoMatches.Add(duplicate.Items[j]);
                        ++j;
                    }
                }

                // Whatever is left of the candidate was never matched.
                if (j < duplicate.Items.Count)
                {
                    if (duplicate.NoMatches == null)
                    {
                        duplicate.NoMatches = new List<ArchiveFileInfoSmall>();
                    }
                    duplicate.NoMatches.AddRange(duplicate.Items.GetRange(j, duplicate.Items.Count - j));
                    skippedCount = duplicate.NoMatches.Count;
                }

                // An empty candidate yields NaN here, which fails the comparison below.
                double percent = (double)(duplicate.Items.Count - skippedCount) / duplicate.Items.Count * 100;

                // The bound is inclusive, like the loop guard above and the percentage check:
                // with a strict '<' a candidate could never match when limitCount is 0
                // ("all must match") or when it sits exactly on the configured limit.
                if (percent >= option.Limit && skippedCount <= limitCount)
                {
                    duplicate.Percentage = percent;
                    duplicate.MatchType  = MatchType.SUBSET;
                    return true;
                }

                // Not a match: drop the bookkeeping collected during this comparison.
                if (duplicate.NoMatches != null)
                {
                    duplicate.NoMatches.Clear();
                }
                return false;
            }
        }
Example #3
0
        /// <summary>
        /// Step 3: Build duplicate list. Each remaining archive is taken in turn as the
        /// "original" and compared against the rest of the list.
        /// </summary>
        /// <param name="list">
        /// Archives to group. The list is sorted in place and consumed (entries are removed)
        /// while the groups are built.
        /// </param>
        /// <param name="option">
        /// Search options; <c>TaskLimit</c> greater than 1 selects the parallel comparison,
        /// and the whole object is forwarded to <c>Compare</c>.
        /// </param>
        /// <returns>The groups that have at least one duplicate, each sorted by percentage comparer.</returns>
        private List<DuplicateArchiveInfoList> BuildDuplicateList(List<DuplicateArchiveInfo> list, DuplicateSearchOption option)
        {
            NotifyCaller("Start building duplicate list.", OperationStatus.BUILDING_DUPLICATE_LIST);

            List<DuplicateArchiveInfoList> dupList = new List<DuplicateArchiveInfoList>();

            list.Sort(new DuplicateArchiveInfoItemCountComparer());

            int totalCount = list.Count;
            int i          = 0;

            while (list.Count > 0)
            {
                _pauseEvent.WaitOne(Timeout.Infinite);
                if (_shutdownEvent.WaitOne(0))
                {
                    NotifyCaller("Stopping...", OperationStatus.BUILDING_DUPLICATE_LIST);
                    break;
                }

                ++i;
                DuplicateArchiveInfoList dup      = new DuplicateArchiveInfoList();
                DuplicateArchiveInfo     original = list[0];
                list.RemoveAt(0);
                dup.Original = original;

                // Progress-only notification: the message text is intentionally empty.
                NotifyCaller("", OperationStatus.BUILDING_DUPLICATE_LIST, curr: i, total: totalCount);

                // parallel method
                if (option.TaskLimit > 1)
                {
                    var taskScheduler = new Nandaka.Common.LimitedConcurrencyLevelTaskScheduler(option.TaskLimit, 16);
                    var pOption       = new ParallelOptions()
                    {
                        TaskScheduler = taskScheduler
                    };

                    // The list itself is only read inside the loop. Matched entries are flagged
                    // with IsRemoved instead of being removed; when such an entry is later taken
                    // as the original, Compare rejects it because its MatchType is no longer ORIGINAL.
                    Parallel.For(0, list.Count, pOption, (innerIdx) =>
                    {
                        DuplicateArchiveInfo curr = list[innerIdx];
                        if (curr.IsRemoved)
                        {
                            return;
                        }

                        if (Compare(ref original, ref curr, option))
                        {
                            // List<T> is not safe for concurrent writers, so the lazy creation
                            // of dup.Duplicates and the Add must happen under the same lock.
                            lock (list)
                            {
                                if (dup.Duplicates == null)
                                {
                                    dup.Duplicates = new List<DuplicateArchiveInfo>();
                                }
                                curr.IsRemoved = true;
                                dup.Duplicates.Add(curr);
                            }
                        }
                    });
                }
                else
                {
                    // check for other possible dups.
                    int index = 0;
                    while (list.Count > index)
                    {
                        DuplicateArchiveInfo curr = list[index];
                        if (Compare(ref original, ref curr, option))
                        {
                            if (dup.Duplicates == null)
                            {
                                dup.Duplicates = new List<DuplicateArchiveInfo>();
                            }
                            dup.Duplicates.Add(curr);
                            // Remove from the source list; the next candidate slides into
                            // 'index', so the index is not advanced.
                            list.Remove(curr);
                            --totalCount;
                        }
                        else
                        {
                            ++index;
                        }
                    }
                }

                // Only keep groups that actually contain a duplicate.
                if (dup.Duplicates != null && dup.Duplicates.Count > 0)
                {
                    dupList.Add(dup);
                }
            }

            foreach (DuplicateArchiveInfoList dup in dupList)
            {
                if (dup.Duplicates != null)
                {
                    dup.Duplicates.Sort(new DuplicateArchiveInfoPercentageComparer());
                }
            }
            NotifyCaller("Building Duplicate List Complete.", OperationStatus.BUILDING_DUPLICATE_LIST);

            return dupList;
        }
Example #4
0
        /// <summary>
        /// Build DuplicateArchiveInfo containing the files' crc, sorted.
        /// </summary>
        /// <param name="filename">Path of the archive to open with <c>SevenZipExtractor</c>.</param>
        /// <param name="option">
        /// Supplies the 7-zip library path, the optional blacklist pattern (with its
        /// case-sensitivity flag) and the small-file filter settings.
        /// </param>
        /// <returns>
        /// The archive description: non-directory entries end up either in <c>Items</c> or,
        /// when blacklisted or below the small-file limit, in <c>Skipped</c> with a remark.
        /// </returns>
        public static DuplicateArchiveInfo GetArchiveInfo(string filename, DuplicateSearchOption option)
        {
            // Build the regex only when a pattern is configured: a null pattern would make
            // the Regex constructor throw even though no blacklist check is wanted.
            bool useBlacklist = !String.IsNullOrWhiteSpace(option.BlacklistPattern);
            Regex blacklist = null;
            if (useBlacklist)
            {
                blacklist = new Regex(option.BlacklistPattern, option.BlacklistCaseInsensitive ? RegexOptions.IgnoreCase : RegexOptions.None);
            }

            DuplicateArchiveInfo info = new DuplicateArchiveInfo();

            SevenZipExtractor.SetLibraryPath(option.SevenZipPath);
            using (SevenZipExtractor extractor = new SevenZipExtractor(filename))
            {
                info.Filename     = filename;
                info.Items        = new List<ArchiveFileInfoSmall>();
                info.RealSize     = extractor.UnpackedSize;
                info.ArchivedSize = extractor.PackedSize;

                // Sum of all file sizes; used below when the archive reports a size of -1.
                ulong countedSize = 0;

                foreach (ArchiveFileInfo af in extractor.ArchiveFileData)
                {
                    if (af.IsDirectory)
                    {
                        info.DirectoryCount++;
                        continue;
                    }

                    ArchiveFileInfoSmall item = new ArchiveFileInfoSmall()
                    {
                        Crc      = ConvertToHexString(af.Crc),
                        Filename = af.FileName,
                        Size     = af.Size
                    };

                    // The blacklist takes precedence over the small-file filter.
                    string skipRemark = null;
                    if (useBlacklist && blacklist.IsMatch(af.FileName))
                    {
                        skipRemark = "Blacklisted";
                    }
                    else if (option.IgnoreSmallFile && item.Size < option.SmallFileSizeLimit)
                    {
                        skipRemark = "SmallFileSizeLimit";
                    }

                    if (skipRemark != null)
                    {
                        if (info.Skipped == null)
                        {
                            info.Skipped = new List<ArchiveFileInfoSmall>();
                        }
                        item.Remark = skipRemark;
                        info.Skipped.Add(item);
                    }
                    else
                    {
                        item.Remark = "";
                        info.Items.Add(item);
                    }

                    // Skipped files still count towards the total size.
                    countedSize += af.Size;
                }

                if (info.RealSize == -1)
                {
                    info.RealSize = Convert.ToInt64(countedSize);
                }

                info.SortItems();
            }
            return info;
        }