/// <summary>
/// Buckets the given files under a display key built from each file's length.
/// </summary>
/// <param name="candidates">Files to bucket.</param>
/// <returns>
/// The buckets after being passed through <c>ComparisonUtils.removeOneElementGroups</c>.
/// </returns>
public static MultiValueDictionary <String, FileInfo> matchByLength(List <FileInfo> candidates)
        {
            var filesByLength = new MultiValueDictionary <string, FileInfo>();

            // The key doubles as the label of the group, hence the "Length: " prefix.
            foreach (FileInfo candidate in candidates)
            {
                filesByLength.Add("Length: " + candidate.Length, candidate);
            }

            MultiValueDictionary <string, FileInfo> matches = ComparisonUtils.removeOneElementGroups(filesByLength);

            return(matches);
        }
Example #2
0
        /// <summary>
        /// Splits every existing group further by one additional file property
        /// (name, creation time or last write time).
        /// </summary>
        /// <param name="groups">Groups produced by an earlier matching step.</param>
        /// <param name="constraint">The property that the members of a group must share.</param>
        /// <returns>
        /// The regrouped files after being passed through
        /// <c>ComparisonUtils.removeOneElementGroups</c>. Each new key is the old key followed by
        /// ",  " and "&lt;description&gt;: &lt;value&gt;" for the chosen property.
        /// </returns>
        /// <exception cref="ArgumentException">
        /// Thrown when <paramref name="constraint"/> is not one of the handled values.
        /// </exception>
        private static MultiValueDictionary <string, FileInfo> matchConstraints(MultiValueDictionary <string, FileInfo> groups, MatcherConstraint constraint)
        {
            string constraintDescription;
            Func <FileInfo, string> getConstraintInfo;

            switch (constraint)
            {
            case MatcherConstraint.SameName:
                constraintDescription = "Filename";
                getConstraintInfo     = info => info.Name;
                break;

            case MatcherConstraint.SameCreationDate:
                constraintDescription = "Creation Date";
                // Default DateTime formatting: files are compared by the formatted text, not the raw ticks.
                getConstraintInfo     = info => info.CreationTime.ToString();
                break;

            case MatcherConstraint.SameModifiedDate:
                // Label was previously misspelled as "Modfied Date".
                constraintDescription = "Modified Date";
                getConstraintInfo     = info => info.LastWriteTime.ToString();
                break;

            default:
                throw new ArgumentException("Invalid MatcherConstraint");
            }

            var regrouped = new MultiValueDictionary <string, FileInfo>();

            foreach (KeyValuePair <string, HashSet <FileInfo> > entry in groups)
            {
                foreach (FileInfo info in entry.Value)
                {
                    // Extending the old key keeps files from different source groups apart
                    // even when they share the same constraint value.
                    string constraintValue = getConstraintInfo(info);
                    string constraintKey   = constraintDescription + ": " + constraintValue;
                    string newkey          = entry.Key + ",  " + constraintKey;
                    regrouped.Add(newkey, info);
                }
            }

            return(ComparisonUtils.removeOneElementGroups(regrouped));
        }
Example #3
0
        /// <summary>
        /// Dispatches to the matcher selected by <c>options.matcherType</c>.
        /// </summary>
        /// <param name="options">Search options; supplies the matcher type and the similarity threshold.</param>
        /// <param name="candidates">Files to group.</param>
        /// <param name="cancel">Token forwarded to the hash, content and similarity matchers.</param>
        /// <returns>The groups produced by the selected matcher.</returns>
        /// <exception cref="ArgumentException">Thrown for a matcher type that is not handled here.</exception>
        private static MultiValueDictionary <string, FileInfo> matchIntoGroups(SearchOptions options, List <FileInfo> candidates, CancellationToken cancel)
        {
            MatcherType requested = options.matcherType;

            if (requested == MatcherType.Always)
            {
                // Every candidate lands in the same group, keyed "Any".
                var everything = new MultiValueDictionary <string, FileInfo>();
                foreach (FileInfo candidate in candidates)
                {
                    everything.Add("Any", candidate);
                }
                return(ComparisonUtils.removeOneElementGroups(everything));
            }

            if (requested == MatcherType.Length)
            {
                return(LengthMatcher.matchByLength(candidates));
            }

            // Both hash-based modes share one entry point; it branches on the type it is given.
            if (requested == MatcherType.LengthHash || requested == MatcherType.LengthHashContent)
            {
                return(HashContentMatcher.matchByHashOrContent(candidates, options.matcherType, cancel));
            }

            if (requested == MatcherType.Similarity)
            {
                return(SimilarityMatcher.matchBySimilarity(candidates, options.similarityMatcherThreshold, cancel));
            }

            throw new ArgumentException("Invalid MatcherType!");
        }
        /// <summary>
        /// Groups files by length and MD5 hash and, for <c>MatcherType.LengthHashContent</c>,
        /// refines every hash group by comparing file contents.
        /// </summary>
        /// <param name="candidates">Files to group.</param>
        /// <param name="type">Must be <c>LengthHash</c> or <c>LengthHashContent</c>.</param>
        /// <param name="cancel">Polled after each hashed file and around each content comparison.</param>
        /// <returns>
        /// The hash groups (keys "Hash: ...") or content groups (keys "Id: N") after being passed
        /// through <c>ComparisonUtils.removeOneElementGroups</c>; <c>null</c> when cancellation
        /// was requested.
        /// </returns>
        /// <exception cref="ArgumentException">Thrown for any other <paramref name="type"/>.</exception>
        public static MultiValueDictionary <string, FileInfo> matchByHashOrContent(List <FileInfo> candidates, MatcherType type, CancellationToken cancel)
        {
            bool hashOnly = type == MatcherType.LengthHash;

            if (!hashOnly && type != MatcherType.LengthHashContent)
            {
                throw new ArgumentException("Can only match by lengthHash or lengthHashContent!");
            }

            // Stage 1: bucket by length.
            var filesByLength = new MultiValueDictionary <long, FileInfo>();

            foreach (FileInfo candidate in candidates)
            {
                filesByLength.Add(candidate.Length, candidate);
            }

            // Stage 2: hash only the files that share their length with at least one other file.
            var filesByHash = new MultiValueDictionary <string, FileInfo>();

            using (var hasher = MD5.Create())
            {
                foreach (KeyValuePair <long, HashSet <FileInfo> > sameLength in filesByLength)
                {
                    if (sameLength.Value.Count <= 1)
                    {
                        continue;
                    }

                    foreach (FileInfo member in sameLength.Value)
                    {
                        using (var input = File.OpenRead(member.FullName))
                        {
                            string digest = md5ToString(hasher.ComputeHash(input));

                            // In hash-only mode the key is what gets displayed, so it is labelled;
                            // otherwise it is merely an intermediate key and stays bare.
                            filesByHash.Add(hashOnly ? "Hash: " + digest : digest, member);
                        }

                        if (cancel.IsCancellationRequested)
                        {
                            return(null);
                        }
                    }
                }
            }

            if (hashOnly)
            {
                return(ComparisonUtils.removeOneElementGroups(filesByHash));
            }

            // Stage 3: split each hash group into groups of files with equal content.
            var filesById = new MultiValueDictionary <string, FileInfo>();

            // Remembers which content groups were spawned by which hash, so a file is only ever
            // compared against content groups that originate from its own hash group.
            var idsByHash = new MultiValueDictionary <string, string>();

            int nextGroupNumber = 0;

            foreach (KeyValuePair <string, HashSet <FileInfo> > sameHash in filesByHash)
            {
                if (sameHash.Value.Count <= 1)
                {
                    continue;
                }

                foreach (FileInfo current in sameHash.Value)
                {
                    bool joinedExisting = false;

                    if (idsByHash.ContainsKey(sameHash.Key))
                    {
                        foreach (string groupId in idsByHash[sameHash.Key])
                        {
                            // The loop body runs at most once: only the first member of the group
                            // is used as the representative to compare against.
                            foreach (FileInfo representative in filesById[groupId])
                            {
                                ContentComparisonResult verdict = HashContentComparison.compareFilesByContent(current, representative, cancel);

                                // Check cancellation before touching the verdict.
                                if (cancel.IsCancellationRequested)
                                {
                                    return(null);
                                }

                                if (verdict.result)
                                {
                                    // This adds to the very set being enumerated; that is fine only
                                    // because the unconditional break below ends the enumeration
                                    // before the enumerator is advanced again.
                                    filesById.Add(groupId, current);
                                    joinedExisting = true;
                                }

                                break;
                            }

                            if (joinedExisting)
                            {
                                break;
                            }
                        }
                    }

                    if (joinedExisting)
                    {
                        continue;
                    }

                    // No existing group matched: open a new one that later files with the same hash can join.
                    string freshId = "Id: " + nextGroupNumber;
                    nextGroupNumber++;
                    filesById.Add(freshId, current);
                    idsByHash.Add(sameHash.Key, freshId);
                }

                if (cancel.IsCancellationRequested)
                {
                    return(null);
                }
            }

            return(ComparisonUtils.removeOneElementGroups(filesById));
        }