/// <summary>
/// Lists the stored files for the requested search engine, keywords and date, downloads
/// each one, passes it to <c>GenerateResponse</c>, and merges the resulting pages into a
/// single list with continuous ranks starting at 1.
/// </summary>
/// <param name="request">Supplies the search engine, keywords and target date used to build the folder path.</param>
/// <returns>
/// A response carrying the request's target date and the combined pages. The page list is
/// empty when the folder listing is null or contains no files.
/// </returns>
public async Task<PageRankingProcessorResponse> PageRankingProcessorHandler(PageRankingProcessorRequest request)
{
    string targetPath = PathGenerator.Generate(request.SearchEngine, request.Keywords, request.TargetDate);
    StorageClientListFolderResponse files = await _storageClient.ListFolder(new StorageClientListFolderRequest
    {
        FolderPath = targetPath
    });

    List<PageRank> pageRanks = new List<PageRank>();

    // Resolve() treats a null listing from the same storage call as "no files";
    // do the same here instead of failing with a NullReferenceException.
    if (files == null || files.FilePaths == null)
    {
        return new PageRankingProcessorResponse
        {
            Date = request.TargetDate,
            Pages = pageRanks
        };
    }

    // ToList() forces the Select to run now, so every download is started up front.
    List<Task<StorageClientDownloadResponse>> downloadTasks = files.FilePaths
        .Select(f => _storageClient.Download(new StorageClientDownloadRequest { DownloadPath = f }))
        .ToList();

    List<PageRankingProcessorWithSequenceResponse> processedResponses = new List<PageRankingProcessorWithSequenceResponse>();

    // Handle each download as soon as it finishes rather than in listing order;
    // the final ordering is restored below by sorting on Sequence.
    while (downloadTasks.Count > 0)
    {
        Task<StorageClientDownloadResponse> finishedTask = await Task.WhenAny(downloadTasks);
        downloadTasks.Remove(finishedTask);

        // Awaiting the completed task unwraps its result (or rethrows its exception).
        StorageClientDownloadResponse downloadedData = await finishedTask;
        processedResponses.Add(await GenerateResponse(request.SearchEngine, downloadedData.FilePath, downloadedData.FileData));
    }

    // Combine: walk the files by Sequence and each file's pages by Rank,
    // renumbering so the merged list has one continuous rank across all files.
    int rank = 1;
    foreach (var response in processedResponses.OrderBy(c => c.Sequence))
    {
        foreach (var page in response.Pages.OrderBy(c => c.Rank))
        {
            pageRanks.Add(new PageRank { Rank = rank++, Url = page.Url });
        }
    }

    return new PageRankingProcessorResponse
    {
        Date = request.TargetDate,
        Pages = pageRanks
    };
}
/// <summary>
/// Works out the sequence number of the next Google result file to fetch for the given
/// keywords and date, based on the files already present in storage.
/// </summary>
/// <param name="keywords">Keywords used to build the storage folder path.</param>
/// <param name="targetDate">Date used to build the storage folder path.</param>
/// <returns>
/// 1 when the folder listing is null or empty; otherwise the highest sequence number found
/// in the file paths plus one, or <c>null</c> when that value would exceed <c>MaxSequence</c>
/// (signals the caller to terminate the process).
/// </returns>
public async Task<int?> Resolve(string keywords, DateTime targetDate)
{
    // Fetch the existing files from storage and check for the next sequence to fetch.
    string targetPath = PathGenerator.Generate(SearchEngine.Google, keywords, targetDate);
    StorageClientListFolderResponse files = await _storageClient.ListFolder(new StorageClientListFolderRequest
    {
        FolderPath = targetPath
    });

    // Expected format: sympli/SearchRankings/Google/e-settlements/2021-04-24/1.html
    //                  sympli/SearchRankings/Google/e-settlements/2021-04-24/2.html
    //                  ...
    if (files == null || files.FilePaths == null || files.FilePaths.Count == 0)
    {
        return 1;
    }

    // Duplicate! TODO: move and refactor.
    // NOTE(review): the '.' before "htm" is unescaped and so matches any character;
    // left unchanged here to stay in step with the duplicated pattern - confirm and fix both together.
    Regex regex = new Regex("(?i)/(\\d+).htm", RegexOptions.Singleline);

    // Highest sequence number across all paths; a path with no parsable number counts as 0.
    // The guard above guarantees at least one path, so Max cannot throw on an empty sequence.
    int highestSequence = files.FilePaths.Max(filePath =>
    {
        Match match = regex.Match(filePath);
        if (match.Success && int.TryParse(match.Groups[1].Value, out int sequence))
        {
            return sequence;
        }

        return 0;
    });

    // Same condition as "highest + 1 <= MaxSequence", written so the addition cannot
    // overflow when a file happens to be numbered int.MaxValue.
    if (highestSequence < MaxSequence)
    {
        return highestSequence + 1;
    }

    // Max sequence exceeded: return null to terminate the process.
    return null;
}