Esempio n. 1
0
    /// <summary>
    /// Builds a combined page ranking for the requested search engine, keywords and date by
    /// downloading every file listed under the generated storage path and merging the pages
    /// produced from each file.
    /// </summary>
    /// <param name="request">Search engine, keywords and target date identifying the stored files.</param>
    /// <returns>
    /// A response carrying the request's target date and the merged pages, renumbered
    /// consecutively from 1: files are taken in ascending <c>Sequence</c> order and, within a
    /// file, pages in ascending <c>Rank</c> order. The page list is empty when the storage
    /// listing is missing or contains no files.
    /// </returns>
    public async Task <PageRankingProcessorResponse> PageRankingProcessorHandler(PageRankingProcessorRequest request)
    {
        string targetPath = PathGenerator.Generate(request.SearchEngine, request.Keywords, request.TargetDate);

        StorageClientListFolderResponse files = await _storageClient.ListFolder(new StorageClientListFolderRequest { FolderPath = targetPath });

        List <PageRank> pageRanks = new List <PageRank>();

        // Nothing stored for this search: return an empty ranking instead of dereferencing a
        // null listing (the same null/empty listing is treated as a normal case by Resolve).
        if (files == null || files.FilePaths == null || files.FilePaths.Count == 0)
        {
            return new PageRankingProcessorResponse
            {
                Date = request.TargetDate,
                Pages = pageRanks
            };
        }

        // Start all downloads up front; ToList() materializes the query so each download
        // is started exactly once.
        List <Task <StorageClientDownloadResponse> > downloadTasks = files.FilePaths
                                                                     .Select(f => _storageClient.Download(new StorageClientDownloadRequest {
            DownloadPath = f
        }))
                                                                     .ToList();

        List <PageRankingProcessorWithSequenceResponse> processedResponses = new List <PageRankingProcessorWithSequenceResponse>();

        // Process each file as soon as its download completes. Completion order is arbitrary,
        // which is fine because the results are sorted by Sequence below.
        while (downloadTasks.Any())
        {
            Task <StorageClientDownloadResponse> finishedTask = await Task.WhenAny(downloadTasks);

            downloadTasks.Remove(finishedTask);

            // Awaiting the already-completed task unwraps its result (or rethrows its exception).
            StorageClientDownloadResponse downloadedData = await finishedTask;
            processedResponses.Add(await GenerateResponse(request.SearchEngine, downloadedData.FilePath, downloadedData.FileData));
        }

        // Combine: assign one continuous rank across all files.
        int rank = 1;

        foreach (var r in processedResponses.OrderBy(c => c.Sequence))
        {
            foreach (var p in r.Pages.OrderBy(c => c.Rank))
            {
                pageRanks.Add(new PageRank {
                    Rank = rank++, Url = p.Url
                });
            }
        }

        return new PageRankingProcessorResponse
        {
            Date = request.TargetDate,
            Pages = pageRanks
        };
    }
Esempio n. 2
0
        /// <summary>
        /// Determines the next sequence number to fetch for the given keywords and date by
        /// listing the files already stored under the Google path generated for that search.
        /// </summary>
        /// <param name="keywords">Search keywords used to generate the storage path.</param>
        /// <param name="targetDate">Date used to generate the storage path.</param>
        /// <returns>
        /// 1 when no files are stored yet; otherwise the highest sequence number found in the
        /// stored file paths plus one, or <c>null</c> when that value would exceed
        /// <c>MaxSequence</c> (signals the caller to terminate the process).
        /// </returns>
        public async Task <int?> Resolve(string keywords, DateTime targetDate)
        {
            //fetch files from S3 and check for the next sequence to fetch
            string targetPath = PathGenerator.Generate(SearchEngine.Google, keywords, targetDate);

            StorageClientListFolderResponse files = await _storageClient.ListFolder(new StorageClientListFolderRequest { FolderPath = targetPath });

            //expected format:  sympli/SearchRankings/Google/e-settlements/2021-04-24/1.html
            //                  sympli/SearchRankings/Google/e-settlements/2021-04-24/2.html
            //                  ...
            if (files == null || files.FilePaths == null || files.FilePaths.Count == 0)
            {
                return 1;
            }

            //duplicate! TODO: move and refactor
            //NOTE(review): the '.' before "htm" is unescaped and matches any character; the pattern is
            //left unchanged here so it stays in agreement with its duplicate until both are refactored.
            Regex regex = new Regex("(?i)/(\\d+).htm", RegexOptions.Singleline);

            // Highest sequence among the stored files. Paths that do not match the pattern, or
            // whose number does not parse as an int, count as 0.
            int highestSequence = files.FilePaths.Select(filePath =>
            {
                Match match = regex.Match(filePath);
                if (!match.Success)
                {
                    return 0;
                }

                return int.TryParse(match.Groups[1].Value, out int sequence) ? sequence : 0;
            }).Max();

            // Equivalent to "highestSequence + 1 <= MaxSequence" but cannot overflow when a
            // file name carries int.MaxValue.
            if (highestSequence < MaxSequence)
            {
                return highestSequence + 1;
            }

            return null; //max sequence exceeded, return NULL to terminate process.
        }