/// <summary>
/// Buckets the candidate files by their size in bytes.
/// </summary>
/// <param name="candidates">Files to group.</param>
/// <returns>
/// The buckets, keyed by <c>"Length: " + size</c>, after being passed through
/// <c>ComparisonUtils.removeOneElementGroups</c> (presumably dropping buckets that hold a single file).
/// </returns>
public static MultiValueDictionary <String, FileInfo> matchByLength(List <FileInfo> candidates)
{
    var filesByLength = new MultiValueDictionary<string, FileInfo>();

    // The key doubles as the description of the group, hence the readable prefix.
    candidates.ForEach(candidate => filesByLength.Add("Length: " + candidate.Length, candidate));

    return ComparisonUtils.removeOneElementGroups(filesByLength);
}
/// <summary>
/// Refines existing groups: files stay together only if they also agree on the property
/// selected by <paramref name="constraint"/> (file name, creation time or last write time).
/// </summary>
/// <param name="groups">Groups from an earlier matching step, keyed by a description string.</param>
/// <param name="constraint">The additional property the members of a group have to share.</param>
/// <returns>
/// The refined groups after being passed through <c>ComparisonUtils.removeOneElementGroups</c>.
/// Each new key is the old key followed by <c>", &lt;description&gt;: &lt;value&gt;"</c>.
/// </returns>
/// <exception cref="ArgumentException">The constraint is not one of the handled values.</exception>
private static MultiValueDictionary <string, FileInfo> matchConstraints(MultiValueDictionary <string, FileInfo> groups, MatcherConstraint constraint)
{
    string constraintDescription;
    Func<FileInfo, string> getConstraintInfo;

    switch (constraint)
    {
        case MatcherConstraint.SameName:
            constraintDescription = "Filename";
            getConstraintInfo = info => info.Name;
            break;

        case MatcherConstraint.SameCreationDate:
            constraintDescription = "Creation Date";
            // Dates are compared through their default string form, so two files match
            // when DateTime.ToString() renders their timestamps identically.
            getConstraintInfo = info => info.CreationTime.ToString();
            break;

        case MatcherConstraint.SameModifiedDate:
            // Label spelling fixed (was "Modfied Date"); it ends up in the group key.
            constraintDescription = "Modified Date";
            getConstraintInfo = info => info.LastWriteTime.ToString();
            break;

        default:
            throw new ArgumentException("Invalid MatcherConstraint");
    }

    var regrouped = new MultiValueDictionary<string, FileInfo>();

    foreach (KeyValuePair<string, HashSet<FileInfo>> entry in groups)
    {
        foreach (FileInfo info in entry.Value)
        {
            // Extending the previous key keeps files from different original groups apart
            // even when their constraint values happen to be equal.
            string constraintKey = constraintDescription + ": " + getConstraintInfo(info);
            regrouped.Add(entry.Key + ", " + constraintKey, info);
        }
    }

    return ComparisonUtils.removeOneElementGroups(regrouped);
}
/// <summary>
/// Dispatches to the grouping strategy selected by <c>options.matcherType</c>.
/// </summary>
/// <param name="options">Search options; supplies the matcher type and the similarity threshold.</param>
/// <param name="candidates">Files to be grouped.</param>
/// <param name="cancel">Token handed on to the hash/content and similarity matchers.</param>
/// <returns>The groups produced by the selected matcher.</returns>
/// <exception cref="ArgumentException">The matcher type is not one of the handled values.</exception>
private static MultiValueDictionary <string, FileInfo> matchIntoGroups(SearchOptions options, List <FileInfo> candidates, CancellationToken cancel)
{
    MatcherType matcher = options.matcherType;

    switch (matcher)
    {
        case MatcherType.Always:
        {
            // No criterion at all: every candidate lands in the same "Any" group.
            var everything = new MultiValueDictionary<string, FileInfo>();
            foreach (FileInfo candidate in candidates)
            {
                everything.Add("Any", candidate);
            }

            return ComparisonUtils.removeOneElementGroups(everything);
        }

        case MatcherType.Length:
            return LengthMatcher.matchByLength(candidates);

        // Both hash-based modes share one implementation that branches on the type itself.
        case MatcherType.LengthHash:
        case MatcherType.LengthHashContent:
            return HashContentMatcher.matchByHashOrContent(candidates, matcher, cancel);

        case MatcherType.Similarity:
            return SimilarityMatcher.matchBySimilarity(candidates, options.similarityMatcherThreshold, cancel);

        default:
            throw new ArgumentException("Invalid MatcherType!");
    }
}
/// <summary>
/// Groups files by length and MD5 hash and, for <c>MatcherType.LengthHashContent</c>,
/// additionally by a content comparison inside each hash group.
/// </summary>
/// <param name="candidates">Files to examine.</param>
/// <param name="type">Must be <c>LengthHash</c> or <c>LengthHashContent</c>.</param>
/// <param name="cancel">Polled after each hashed file and around each content comparison.</param>
/// <returns>
/// The groups after being passed through <c>ComparisonUtils.removeOneElementGroups</c>:
/// keyed by <c>"Hash: " + hash</c> for <c>LengthHash</c>, or by <c>"Id: " + counter</c> for
/// <c>LengthHashContent</c>. Returns <c>null</c> when cancellation was requested.
/// </returns>
/// <exception cref="ArgumentException"><paramref name="type"/> is any other matcher type.</exception>
public static MultiValueDictionary <string, FileInfo> matchByHashOrContent(List <FileInfo> candidates, MatcherType type, CancellationToken cancel)
{
    bool hashOnly = type == MatcherType.LengthHash;
    if (!hashOnly && type != MatcherType.LengthHashContent)
    {
        throw new ArgumentException("Can only match by lengthHash or lengthHashContent!");
    }

    // Stage 1: bucket by length, so only files with a same-sized sibling get hashed.
    var filesByLength = new MultiValueDictionary<long, FileInfo>();
    foreach (FileInfo candidate in candidates)
    {
        filesByLength.Add(candidate.Length, candidate);
    }

    // Stage 2: hash every file that shares its length with at least one other file.
    var filesByHash = new MultiValueDictionary<string, FileInfo>();
    using (var hasher = MD5.Create())
    {
        foreach (KeyValuePair<long, HashSet<FileInfo>> lengthGroup in filesByLength)
        {
            if (lengthGroup.Value.Count <= 1)
            {
                continue;
            }

            foreach (FileInfo member in lengthGroup.Value)
            {
                using (var input = File.OpenRead(member.FullName))
                {
                    string digest = md5ToString(hasher.ComputeHash(input));

                    // Hash keys are final (and displayed) only in hash-only mode; otherwise
                    // they are intermediate and stay undecorated.
                    string hashKey = hashOnly ? "Hash: " + digest : digest;
                    filesByHash.Add(hashKey, member);
                }

                if (cancel.IsCancellationRequested)
                {
                    return null;
                }
            }
        }
    }

    if (hashOnly)
    {
        return ComparisonUtils.removeOneElementGroups(filesByHash);
    }

    // Stage 3: split each hash group into content groups.
    var filesById = new MultiValueDictionary<string, FileInfo>();

    // Remembers which content groups were spawned by which hash group, so a file is only
    // compared against content groups that originate from its own hash group.
    var idsByHash = new MultiValueDictionary<string, string>();
    int nextId = 0;

    foreach (KeyValuePair<string, HashSet<FileInfo>> hashGroup in filesByHash)
    {
        if (hashGroup.Value.Count <= 1)
        {
            continue;
        }

        foreach (FileInfo current in hashGroup.Value)
        {
            bool joined = false;

            if (idsByHash.ContainsKey(hashGroup.Key))
            {
                foreach (string groupId in idsByHash[hashGroup.Key])
                {
                    // Only the first member of a content group is used as its representative;
                    // the loop is left unconditionally after one comparison.
                    foreach (FileInfo representative in filesById[groupId])
                    {
                        ContentComparisonResult outcome = HashContentComparison.compareFilesByContent(current, representative, cancel);
                        if (cancel.IsCancellationRequested)
                        {
                            return null;
                        }

                        if (outcome.result)
                        {
                            // The set being enumerated is modified here; that relies on the
                            // break below ending the enumeration right away.
                            filesById.Add(groupId, current);
                            joined = true;
                        }

                        break;
                    }

                    if (joined)
                    {
                        break;
                    }
                }
            }

            if (joined)
            {
                continue;
            }

            // No existing content group matched: open a new one that later files of the
            // same hash group can join.
            string freshId = "Id: " + nextId++;
            filesById.Add(freshId, current);
            idsByHash.Add(hashGroup.Key, freshId);
        }

        if (cancel.IsCancellationRequested)
        {
            return null;
        }
    }

    return ComparisonUtils.removeOneElementGroups(filesById);
}