/// <summary>
/// Runs the configured request processor and dispatches each request result.
/// A result that carries a response message is handed to <paramref name="responseAction"/>;
/// a result without one is recorded on the URI's crawl state and the URI is queued again.
/// </summary>
/// <param name="responseAction">
/// Callback awaited for every result that has a response message. It receives the result
/// and the crawl state tracked for the requested URI.
/// </param>
/// <param name="cancellationToken">Token forwarded to the request processor.</param>
/// <returns>An array copy of <c>CrawledUris</c> taken once the request processor has finished.</returns>
public async Task<IEnumerable<CrawledUri>> ProcessAsync(
    Func<RequestResult, UriCrawlState, Task> responseAction,
    CancellationToken cancellationToken = default
)
{
    await Settings.RequestProcessor.ProcessAsync(
        HttpClient,
        async (result) =>
        {
            var requestedUri = result.RequestUri;
            var state = UriCrawlStates.GetOrAdd(requestedUri, new UriCrawlState
            {
                Location = requestedUri
            });

            if (result.ResponseMessage == null)
            {
                // No response came back: keep the timing of the failed attempt
                // and put the URI back on the queue so it is requested again.
                var failedAttempt = new CrawlRequest
                {
                    RequestStart = result.RequestStart,
                    ElapsedTime = result.ElapsedTime
                };
                state.Requests.Add(failedAttempt);
                AddRequest(result.RequestUri);
                return;
            }

            await responseAction(result, state);
        },
        Settings.RequestProcessorOptions,
        cancellationToken
    );

    return CrawledUris.ToArray();
}
/// <summary>
/// Runs the configured request processor and routes every request result by outcome:
/// <list type="bullet">
/// <item>an exception: the attempt is recorded and the URI is queued again;</item>
/// <item>a redirect (301, 302, 303, 307, 308) with a <c>Location</c> header: the redirect is registered via <c>AddRedirect</c>;</item>
/// <item>a 2xx status: <paramref name="responseSuccessAction"/> is awaited;</item>
/// <item>a 5xx status: the URI is queued again;</item>
/// <item>anything else: a <c>CrawledUri</c> result is stored and no further request is made.</item>
/// </list>
/// </summary>
/// <param name="responseSuccessAction">
/// Callback awaited for every result with a 2xx status code. It receives the result
/// and the crawl state tracked for the requested URI.
/// </param>
/// <param name="cancellationToken">Token forwarded to the request processor.</param>
/// <returns>An array copy of <c>CrawledUris</c> taken once the request processor has finished.</returns>
/// <exception cref="ArgumentNullException"><paramref name="responseSuccessAction"/> is <c>null</c>.</exception>
public async Task<IEnumerable<CrawledUri>> ProcessAsync(
    Func<RequestResult, UriCrawlState, Task> responseSuccessAction,
    CancellationToken cancellationToken = default
)
{
    // Fail fast: otherwise a null callback only surfaces as a NullReferenceException
    // inside the processor callback on the first successful response.
    if (responseSuccessAction is null)
    {
        throw new ArgumentNullException(nameof(responseSuccessAction));
    }

    await Settings.RequestProcessor.ProcessAsync(
        HttpClient,
        async (requestResult) =>
        {
            var crawlState = UriCrawlStates.GetOrAdd(requestResult.RequestUri, new UriCrawlState
            {
                Location = requestResult.RequestUri
            });

            if (requestResult.Exception != null)
            {
                // Retry failed requests.
                // NOTE(review): no retry limit is visible in this method - confirm that
                // AddRequest (or the crawl state) caps the number of attempts.
                Logger?.LogDebug($"An exception occurred while requesting {crawlState.Location}. This URL will be added to the request queue to be attempted again later.");
                crawlState.Requests.Add(new CrawlRequest
                {
                    RequestStart = requestResult.RequestStart,
                    ElapsedTime = requestResult.ElapsedTime
                });
                AddRequest(requestResult.RequestUri);
                return;
            }

            // Numeric status code, computed once and reused for every range check below.
            var statusCode = (int)requestResult.StatusCode;

            var crawlRequest = new CrawlRequest
            {
                RequestStart = requestResult.RequestStart,
                ElapsedTime = requestResult.ElapsedTime,
                StatusCode = requestResult.StatusCode,
                IsSuccessfulStatus = statusCode is >= 200 and <= 299
            };
            crawlState.Requests.Add(crawlRequest);

            // 301 Moved Permanently, 302 Found, 303 See Other, 307 Temporary Redirect and
            // 308 Permanent Redirect all point the client at the URI in the Location header.
            // A constant pattern avoids allocating and scanning an array for every response.
            var isRedirect = statusCode is 301 or 302 or 303 or 307 or 308;

            // A redirect without a Location header cannot be followed; it falls through to
            // the final branch and is recorded as a finished result.
            if (isRedirect && requestResult.ResponseHeaders.Location is not null)
            {
                Logger?.LogDebug($"Result for {crawlState.Location} was a redirect ({requestResult.ResponseHeaders.Location}). This URL will be added to the request queue.");
                AddRedirect(crawlState.Location, requestResult.ResponseHeaders.Location);
            }
            else if (crawlRequest.IsSuccessfulStatus)
            {
                await responseSuccessAction(requestResult, crawlState);
            }
            else if (statusCode is >= 500 and <= 599)
            {
                // On server errors, try to crawl the page again later.
                Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). This URL will be added to the request queue to be attempted again later.");
                AddRequest(crawlState.Location);
            }
            else
            {
                // On any other error, just save what we have seen and move on.
                // Consider the content of the request irrelevant.
                Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). No further requests will be attempted.");
                AddResult(new CrawledUri
                {
                    Location = crawlState.Location,
                    Status = CrawlStatus.Crawled,
                    RedirectChain = crawlState.Redirects,
                    Requests = crawlState.Requests
                });
            }
        },
        Settings.RequestProcessorOptions,
        cancellationToken
    );

    Logger?.LogDebug($"Completed crawling {CrawledUris.Count} pages.");

    return CrawledUris.ToArray();
}