/// <summary>
        /// Finds pairs of directories that share files and ranks them by the
        /// space that could be reclaimed.
        /// </summary>
        /// <param name="index">Index whose <c>IndexedFiles</c> values are grouped by <c>FileId</c>.</param>
        /// <param name="maxResults">Maximum number of analysed pairs to return.</param>
        /// <returns>
        /// At most <paramref name="maxResults"/> analysed directory pairs, ordered by
        /// descending <c>PotentialSaving</c>.
        /// </returns>
        public List <AnalysedDirectoryPair> AnalyseDirectories(IndexFile.Index index, int maxResults)
        {
            // Groups with this many files or more are excluded from pair generation.
            const int maxGroupSize = 100;

            var groupedFiles = index.IndexedFiles
                               .Select(ix => ix.Value)
                               .GroupBy(f => f.FileId)
                               .ToList();

            // NOTE(review): this counts every FileId group, including groups that
            // contain a single file; the wording is kept as-is for existing log consumers.
            Console.WriteLine($"Found {groupedFiles.Count} Files with duplicates");

            var directoryPairs = groupedFiles
                                 .Where(g => g.Count() < maxGroupSize)
                                 .SelectMany(g => GetDirectoryPairs(g))
                                 .ToList();

            // Previously this line repeated the file-group count; report the pairs instead.
            Console.WriteLine($"Found {directoryPairs.Count} directory pairs");

            // The same pair can be produced by several file groups; keep one per UniqueKey.
            var dedupedDirectoryPairs = directoryPairs
                                        .GroupBy(dp => dp.UniqueKey)
                                        .Select(g => g.First())
                                        .ToList();

            var directoriesToAnalyse = dedupedDirectoryPairs
                                       .Select(dp => AnalyseDirectoryPair(index, dp))
                                       .OrderByDescending(dp => dp.PotentialSaving)
                                       .Take(maxResults)
                                       .ToList();

            return(directoriesToAnalyse);
        }
        /// <summary>
        /// Compares the files of the two directories in <paramref name="directoryPair"/>
        /// and summarises which files are exclusive to each side, which are shared,
        /// and how much space the shared files occupy.
        /// </summary>
        /// <param name="index">Index providing <c>DirectoryFiles</c>, looked up by directory.</param>
        /// <param name="directoryPair">The two directories to compare.</param>
        /// <returns>
        /// An <see cref="AnalysedDirectoryPair"/> with the per-directory-only files, the
        /// files grouped by shared <c>FileId</c>, their sizes, and the potential saving
        /// (total size of the shared groups minus one copy of each).
        /// </returns>
        private AnalysedDirectoryPair AnalyseDirectoryPair(IndexFile.Index index, DirectoryPair directoryPair)
        {
            var d1Files = index.DirectoryFiles[directoryPair.Directory1];
            var d2Files = index.DirectoryFiles[directoryPair.Directory2];

            // Hash sets keep the membership checks below O(1) per file instead of
            // scanning a list for every file in the other directory.
            var d1FileIds = d1Files.Select(f => f.FileId).ToHashSet();
            var d2FileIds = d2Files.Select(f => f.FileId).ToHashSet();

            var d1OnlyFiles = d1Files.Where(f => !d2FileIds.Contains(f.FileId)).ToList();
            var d2OnlyFiles = d2Files.Where(f => !d1FileIds.Contains(f.FileId)).ToList();

            // NOTE(review): a FileId that occurs more than once inside a single directory
            // (and not in the other) may also end up here, unless Union's element equality
            // collapses those entries - confirm whether that is intended.
            var commonFiles = d1Files.Union(d2Files)
                              .GroupBy(f => f.FileId)
                              .Where(g => g.Count() > 1)
                              .ToDictionary(g => g.Key, g => g.ToList());

            var totalCommonFileSize  = commonFiles.Sum(g => g.Value.Sum(f => f.Size));
            // One copy of each shared file has to stay, so only the surplus counts as saving.
            var uniqueCommonFileSize = commonFiles.Sum(g => g.Value.First().Size);

            return(new AnalysedDirectoryPair
            {
                Directory1 = directoryPair.Directory1,
                Directory2 = directoryPair.Directory2,

                Directory1OnlyFiles = d1OnlyFiles,
                Directory2OnlyFiles = d2OnlyFiles,
                CommonFiles = commonFiles,

                Directory1OnlyFilesSize = d1OnlyFiles.Sum(f => f.Size),
                Directory2OnlyFilesSize = d2OnlyFiles.Sum(f => f.Size),
                CommonFilesSize = totalCommonFileSize,

                PotentialSaving = totalCommonFileSize - uniqueCommonFileSize
            });
        }
        /*
         * public static void Calculate(string directoryPath, string outputFilename, Dictionary<string, IndexedFile> existingEtags)
         * {
         *  var indexedFiles = IndexNewFiles(directoryPath, existingEtags);
         *  IndexWriter.WriteIndex(outputFilename, indexedFiles);
         * }
         */
        /// <summary>
        /// Walks the directory of a <see cref="FileSystemSource"/> and builds index
        /// entries for every file whose full path is not yet a key in
        /// <c>index.IndexedFiles</c>.
        /// </summary>
        /// <param name="source">Must be a <see cref="FileSystemSource"/>; its <c>Path</c> is scanned recursively.</param>
        /// <param name="index">Existing index; files already present in it are not re-indexed.</param>
        /// <returns>The newly created <see cref="IndexedFile"/> entries.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="source"/> is not a <see cref="FileSystemSource"/>.
        /// </exception>
        /// <remarks>
        /// Failures on an individual file are logged and that file is skipped; they do
        /// not abort the scan.
        /// </remarks>
        public async Task <IEnumerable <IndexedFile> > IndexNewFilesAsync(ISource source, IndexFile.Index index)
        {
            if (!(source is FileSystemSource fileSystemSource))
            {
                // The message previously named S3Source, which is not what this method expects.
                throw new ArgumentException("source should be a FileSystemSource", nameof(source));
            }

            var directoryPath = fileSystemSource.Path;
            var existingEtags = index.IndexedFiles;
            var files         = GetRecursiveFiles(directoryPath);

            var result = new List <IndexedFile>();

            // Debug switch: when enabled, already-indexed files are re-hashed and
            // compared against the stored ETag.
            var verify = false;

            // File names containing these characters are skipped with a warning.
            var invalidCharacters = new char[]
            {
                '\r',
                '\n'
            };

            foreach (var file in files)
            {
                try
                {
                    var filename = file.FullName;

                    if (existingEtags.TryGetValue(filename, out var s3File))
                    {
                        if (verify)
                        {
                            var md5 = await CalculateETag(file);

                            var matches   = md5.Equals(s3File.Etag);
                            var matchText = matches ?
                                            $"OK - {md5}" :
                                            $"FAIL - S3 {s3File.Etag} - Local {md5}";
                            Console.WriteLine($"Verifying {filename} - {matchText}");
                        }
                    }
                    else if (filename.IndexOfAny(invalidCharacters) >= 0)
                    {
                        _logger.Warn($"Skipping file, Filename contains an invalid character: [{filename}]");
                    }
                    else
                    {
                        var etag = await CalculateETag(file);

                        result.Add(new IndexedFile(filename, etag, file.Length));
                    }
                }
                catch (Exception ex)
                {
                    // Best effort: one unreadable file must not stop the whole scan.
                    _logger.Error($"Exception indexing \"{file.FullName}\" {ex.Message}");
                }
            }

            return(result);
        }
// Example #4
// 0
        /// <summary>
        /// Lists groups of files that share both <c>FileId</c> and <c>Directory</c>,
        /// largest potential saving first.
        /// </summary>
        /// <param name="index">Index whose <c>IndexedFiles</c> values are examined.</param>
        /// <param name="maxResults">Maximum number of groups to yield.</param>
        /// <returns>
        /// A lazily evaluated sequence of at most <paramref name="maxResults"/> mapped
        /// groups, ordered by descending <c>PotentialSaving</c>.
        /// </returns>
        public static IEnumerable <DuplicatesWithinDirectory> FindDuplicatesWithinDirectories(IndexFile.Index index, int maxResults)
        {
            // Only groups holding more than one file are duplicates.
            var duplicateGroups = index.IndexedFiles.Values
                                  .GroupBy(file => $"{file.FileId}-{file.Directory}")
                                  .Where(group => group.Count() > 1);

            return duplicateGroups
                   .Select(MapGrouping)
                   .OrderByDescending(summary => summary.PotentialSaving)
                   .Take(maxResults);
        }
// Example #5
// 0
        /// <summary>
        /// Lists the objects of an S3 bucket and builds index entries for every object
        /// whose key is not yet present in <c>index.IndexedFiles</c>.
        /// </summary>
        /// <param name="source">Must be an <see cref="S3Source"/> supplying credentials and bucket name.</param>
        /// <param name="index">Existing index; keys already present in it are skipped.</param>
        /// <returns>The newly created <see cref="IndexedFile"/> entries.</returns>
        /// <exception cref="ArgumentException">
        /// <paramref name="source"/> is not an <see cref="S3Source"/>.
        /// </exception>
        public async Task <IEnumerable <IndexedFile> > IndexNewFilesAsync(ISource source, IndexFile.Index index)
        {
            if (!(source is S3Source s3Source))
            {
                throw new ArgumentException("source should be an S3Source", nameof(source));
            }

            var results = new List <IndexedFile>();

            // The client is disposable; release it once the listing is complete.
            using (var client = new AmazonS3Client(
                       new BasicAWSCredentials(s3Source.AccessKey, s3Source.SecretKey)))
            {
                string continuationToken = null;

                // Page through the listing until no continuation token is returned.
                do
                {
                    var listresponse = await client.ListObjectsV2Async(
                        new ListObjectsV2Request()
                    {
                        BucketName        = s3Source.BucketName,
                        ContinuationToken = continuationToken
                    });

                    foreach (var s3object in listresponse.S3Objects)
                    {
                        if (!index.IndexedFiles.ContainsKey(s3object.Key))
                        {
                            results.Add(new IndexedFile(s3object.Key, s3object.ETag, s3object.Size));
                        }
                    }

                    continuationToken = listresponse.NextContinuationToken;
                } while (continuationToken != null);
            }

            return(results);
        }