/// <summary>
/// Finds pairs of directories that share duplicated files and ranks them by the
/// disk space that could be reclaimed by de-duplicating.
/// </summary>
/// <param name="index">Index containing all files keyed by path, with content ids.</param>
/// <param name="maxResults">Maximum number of analysed pairs to return.</param>
/// <returns>Up to <paramref name="maxResults"/> pairs, largest potential saving first.</returns>
public List<AnalysedDirectoryPair> AnalyseDirectories(IndexFile.Index index, int maxResults)
{
    // Group every indexed file by its content id; a multi-member group is a duplicate set.
    var groupedFiles = index.IndexedFiles
        .Select(ix => ix.Value)
        .GroupBy(f => f.FileId)
        .ToList();
    Console.WriteLine($"Found {groupedFiles.Count} Files with duplicates");

    // Expand each duplicate set into candidate directory pairs.
    // Sets with 100+ copies are skipped: pair expansion is quadratic in group size.
    var directoryPairs = groupedFiles
        .Where(g => g.Count() < 100)
        .SelectMany(g => GetDirectoryPairs(g))
        .ToList();
    // BUG FIX: this previously re-logged groupedFiles.Count (copy-paste of the line above);
    // report the number of candidate pairs instead.
    Console.WriteLine($"Found {directoryPairs.Count} directory pairs");

    // The same (dir1, dir2) pair can be produced by several duplicate sets — keep one.
    var dedupedDirectoryPairs = directoryPairs
        .GroupBy(dp => dp.UniqueKey)
        .Select(g => g.First())
        .ToList();

    // Score each pair and keep the top maxResults by potential saving.
    var directoriesToAnalyse = dedupedDirectoryPairs
        .Select(dp => AnalyseDirectoryPair(index, dp))
        .OrderByDescending(dp => dp.PotentialSaving)
        .Take(maxResults)
        .ToList();

    return directoriesToAnalyse;
}
/// <summary>
/// Compares the two directories of a candidate pair: which files exist only on one
/// side, which are common (same content id), and how much space de-duplicating the
/// common files would save.
/// </summary>
/// <param name="index">Index providing the file list for each directory.</param>
/// <param name="directoryPair">The pair of directory paths to compare.</param>
/// <returns>A populated <see cref="AnalysedDirectoryPair"/> with sizes and savings.</returns>
private AnalysedDirectoryPair AnalyseDirectoryPair(IndexFile.Index index, DirectoryPair directoryPair)
{
    var d1Files = index.DirectoryFiles[directoryPair.Directory1];
    var d2Files = index.DirectoryFiles[directoryPair.Directory2];

    // PERF FIX: the original materialised these as Lists and used List.Contains in the
    // filters below, making them O(n*m). HashSet lookups keep the same results in O(n+m).
    var d1FileIds = d1Files.Select(f => f.FileId).ToHashSet();
    var d2FileIds = d2Files.Select(f => f.FileId).ToHashSet();

    var d1OnlyFiles = d1Files.Where(f => !d2FileIds.Contains(f.FileId)).ToList();
    var d2OnlyFiles = d2Files.Where(f => !d1FileIds.Contains(f.FileId)).ToList();

    // Content ids present on both sides: after Union, a group with more than one member
    // means the id occurred in both directories.
    // NOTE(review): Union de-duplicates by element equality — assumes two distinct file
    // entries never compare equal (reference equality or path-based equality); confirm
    // against the file record type's Equals implementation.
    var commonFiles = d1Files.Union(d2Files)
        .GroupBy(f => f.FileId)
        .Where(g => g.Count() > 1)
        .ToDictionary(g => g.Key, g => g.ToList());

    // Total bytes taken by every copy of the common files ...
    var totalCommonFileSize = commonFiles.Sum(g => g.Value.Sum(f => f.Size));
    // ... versus the bytes needed if only one copy of each were kept.
    var uniqueCommonFileSize = commonFiles.Sum(g => g.Value.First().Size);

    return new AnalysedDirectoryPair
    {
        Directory1 = directoryPair.Directory1,
        Directory2 = directoryPair.Directory2,
        Directory1OnlyFiles = d1OnlyFiles,
        Directory2OnlyFiles = d2OnlyFiles,
        CommonFiles = commonFiles,
        Directory1OnlyFilesSize = d1OnlyFiles.Sum(f => f.Size),
        Directory2OnlyFilesSize = d2OnlyFiles.Sum(f => f.Size),
        CommonFilesSize = totalCommonFileSize,
        PotentialSaving = totalCommonFileSize - uniqueCommonFileSize
    };
}
/// <summary>
/// Walks a file-system source and returns indexed entries (path, etag, size) for every
/// file not already present in <paramref name="index"/>. Files whose names contain CR/LF
/// are skipped (they would corrupt the line-oriented index file).
/// </summary>
/// <param name="source">Must be a <see cref="FileSystemSource"/>.</param>
/// <param name="index">Existing index; files already present are not re-hashed.</param>
/// <returns>The newly indexed files.</returns>
/// <exception cref="ArgumentException">When <paramref name="source"/> is not a FileSystemSource.</exception>
public async Task<IEnumerable<IndexedFile>> IndexNewFilesAsync(ISource source, IndexFile.Index index)
{
    // BUG FIX: the original message said "S3Source" — copy-pasted from the S3 indexer.
    // Pattern-match cast replaces the separate 'is' check + 'as' cast.
    if (!(source is FileSystemSource fileSystemSource))
    {
        throw new ArgumentException("source should be a FileSystemSource", nameof(source));
    }

    var directoryPath = fileSystemSource.Path;
    var existingEtags = index.IndexedFiles;
    var files = GetRecursiveFiles(directoryPath);

    var result = new List<IndexedFile>();
    // Re-hash already-indexed files and compare etags; hard-coded off (debug aid only).
    var verify = false;

    foreach (var file in files)
    {
        try
        {
            var filename = file.FullName; //.Substring(directoryPath.Length + 1);
            if (existingEtags.TryGetValue(filename, out var s3File))
            {
                // Already indexed — optionally verify the stored etag still matches.
                if (verify)
                {
                    var md5 = await CalculateETag(file);
                    var matches = md5.Equals(s3File.Etag);
                    var matchText = matches ? $"OK - {md5}" : $"FAIL - S3 {s3File.Etag} - Local {md5}";
                    Console.WriteLine($"Verifying (unknown) - {matchText}");
                }
            }
            else
            {
                // CR/LF in a path would break the index's line-based serialisation.
                var invalidCharacters = new char[] { '\r', '\n' };
                if (invalidCharacters.Any(c => filename.Contains(c)))
                {
                    _logger.Warn($"Skipping file, Filename contains an invalid character: [(unknown)]");
                }
                else
                {
                    var etag = await CalculateETag(file);
                    result.Add(new IndexedFile(filename, etag, file.Length));
                }
            }
        }
        catch (Exception ex)
        {
            // Best-effort: one unreadable file must not abort the whole scan.
            _logger.Error($"Exception indexing \"{file.FullName}\" {ex.Message}");
        }
    }

    return result;
}
/// <summary>
/// Finds files duplicated *within* a single directory: entries sharing both the same
/// content id and the same directory. Results are ranked by potential saving.
/// </summary>
/// <param name="index">Index whose files are scanned.</param>
/// <param name="maxResults">Maximum number of groups to return.</param>
/// <returns>Up to <paramref name="maxResults"/> duplicate groups, largest saving first.</returns>
public static IEnumerable<DuplicatesWithinDirectory> FindDuplicatesWithinDirectories(IndexFile.Index index, int maxResults)
{
    var allFiles = index.IndexedFiles.Values;

    // Composite key: same content id AND same directory. Only groups with more than
    // one member are true in-directory duplicates.
    var duplicateGroups = allFiles
        .GroupBy(f => $"{f.FileId}-{f.Directory}")
        .Where(grouping => grouping.Count() > 1);

    return duplicateGroups
        .Select(MapGrouping)
        .OrderByDescending(duplicates => duplicates.PotentialSaving)
        .Take(maxResults);
}
/// <summary>
/// Lists every object in the S3 bucket of <paramref name="source"/> (following
/// pagination via continuation tokens) and returns entries for keys not already in
/// <paramref name="index"/>.
/// </summary>
/// <param name="source">Must be an <see cref="S3Source"/> carrying bucket and credentials.</param>
/// <param name="index">Existing index; keys already present are skipped.</param>
/// <returns>The newly discovered objects as <see cref="IndexedFile"/> entries.</returns>
/// <exception cref="ArgumentException">When <paramref name="source"/> is not an S3Source.</exception>
public async Task<IEnumerable<IndexedFile>> IndexNewFilesAsync(ISource source, IndexFile.Index index)
{
    // Pattern-match cast replaces the separate 'is' check + 'as' cast.
    if (!(source is S3Source s3Source))
    {
        throw new ArgumentException("source should be an S3Source", nameof(source));
    }

    var results = new List<IndexedFile>();

    // FIX: AmazonS3Client is IDisposable; the original never disposed it, leaking the
    // underlying HTTP handler on every call.
    using (var client = new AmazonS3Client(
               new BasicAWSCredentials(s3Source.AccessKey, s3Source.SecretKey)))
    {
        // ListObjectsV2 returns at most 1000 keys per page; loop until the service
        // stops handing back a continuation token.
        string continuationToken = null;
        do
        {
            var listResponse = await client.ListObjectsV2Async(
                new ListObjectsV2Request()
                {
                    BucketName = s3Source.BucketName,
                    ContinuationToken = continuationToken
                });

            foreach (var s3Object in listResponse.S3Objects)
            {
                // Only index keys we have not seen before.
                if (!index.IndexedFiles.ContainsKey(s3Object.Key))
                {
                    results.Add(new IndexedFile(s3Object.Key, s3Object.ETag, s3Object.Size));
                }
            }

            continuationToken = listResponse.NextContinuationToken;
        } while (continuationToken != null);
    }

    return results;
}