/// <summary>
/// Processes all queued requests, retrying requests that produced no response message
/// and delegating every request that did respond to <paramref name="responseAction"/>.
/// </summary>
/// <param name="responseAction">Callback invoked with the request result and its crawl state for each response.</param>
/// <param name="cancellationToken">Token used to cancel the crawl.</param>
/// <returns>A snapshot of all URIs crawled so far.</returns>
public async Task<IEnumerable<CrawledUri>> ProcessAsync(
	Func<RequestResult, UriCrawlState, Task> responseAction,
	CancellationToken cancellationToken = default
)
{
	await Settings.RequestProcessor.ProcessAsync(HttpClient, async (requestResult) =>
	{
		// Use the value-factory overload so a new UriCrawlState is only allocated
		// when the URI is not already tracked (the previous code allocated one per callback).
		var crawlState = UriCrawlStates.GetOrAdd(
			requestResult.RequestUri,
			uri => new UriCrawlState { Location = uri });

		if (requestResult.ResponseMessage == null)
		{
			// Retry failed requests: record the attempt, then re-queue the URI.
			crawlState.Requests.Add(new CrawlRequest
			{
				RequestStart = requestResult.RequestStart,
				ElapsedTime = requestResult.ElapsedTime
			});
			AddRequest(requestResult.RequestUri);
		}
		else
		{
			await responseAction(requestResult, crawlState);
		}
	}, Settings.RequestProcessorOptions, cancellationToken);

	return CrawledUris.ToArray();
}
/// <summary>
/// Records a redirect from <paramref name="requestUri"/> to <paramref name="redirectUri"/>,
/// carrying the existing request/redirect history over to the new location and queueing it.
/// </summary>
/// <param name="requestUri">The URI that responded with a redirect.</param>
/// <param name="redirectUri">The redirect target; may be relative to <paramref name="requestUri"/>.</param>
public void AddRedirect(Uri requestUri, Uri redirectUri)
{
	if (!UriCrawlStates.TryRemove(requestUri, out var crawlState))
	{
		return;
	}

	// Resolve relative redirect targets against the original request URI and drop any fragment.
	var targetUri = StripFragment(new Uri(requestUri, redirectUri));

	var redirectCrawlState = new UriCrawlState
	{
		Location = targetUri,
		Redirects = crawlState.Redirects ?? new List<CrawledUriRedirect>()
	};
	redirectCrawlState.Redirects.Add(new CrawledUriRedirect
	{
		Location = crawlState.Location,
		Requests = crawlState.Requests
	});

	UriCrawlStates.TryAdd(targetUri, redirectCrawlState);
	AddRequest(targetUri, true);
}
/// <summary>
/// Records the crawled content for <paramref name="requestUri"/>, honouring in-page robots
/// rules (noindex / nofollow) and queueing any discovered links when following is allowed.
/// </summary>
/// <param name="requestUri">The URI the content was retrieved from.</param>
/// <param name="content">The parsed content; may be null when no content could be parsed.</param>
public void AddResult(Uri requestUri, CrawledContent content)
{
	if (!UriCrawlStates.TryGetValue(requestUri, out var crawlState))
	{
		return;
	}

	// Fix: the previous implementation read content.PageRobotRules unconditionally and
	// threw a NullReferenceException for a null content payload. The sibling overload in
	// this file already guards against null content; do the same here.
	if (content == null)
	{
		Logger?.LogDebug($"Result for {requestUri} has completed successfully without content.");
		AddResult(new CrawledUri
		{
			Location = crawlState.Location,
			Status = CrawlStatus.Crawled,
			RedirectChain = crawlState.Redirects,
			Requests = crawlState.Requests,
			Content = content
		});
		return;
	}

	var robotsPageDefinition = RobotsPageParser.FromRules(content.PageRobotRules);

	if (!robotsPageDefinition.CanIndex(Settings.UserAgent))
	{
		// An in-page robots rule forbids indexing: record the block and keep no content.
		Logger?.LogDebug($"Result content for {requestUri} has been blocked by an in-page Robots rule.");
		AddResult(new CrawledUri
		{
			Location = crawlState.Location,
			Status = CrawlStatus.RobotsBlocked,
			Requests = crawlState.Requests,
			RedirectChain = crawlState.Redirects
		});
		return;
	}

	Logger?.LogDebug($"Result for {requestUri} has completed successfully with content.");
	AddResult(new CrawledUri
	{
		Location = crawlState.Location,
		Status = CrawlStatus.Crawled,
		RedirectChain = crawlState.Redirects,
		Requests = crawlState.Requests,
		Content = content
	});

	// Only follow discovered links when the in-page robots rules allow it.
	if (robotsPageDefinition.CanFollowLinks(Settings.UserAgent))
	{
		foreach (var crawlLink in content.Links)
		{
			AddLink(crawlLink);
		}
	}
}
/// <summary>
/// Records the crawled content for <paramref name="requestUri"/>. When content is present,
/// in-page robots rules are consulted: a noindex rule records a robots-blocked result instead,
/// and links are only queued when following is permitted.
/// </summary>
/// <param name="requestUri">The URI the content was retrieved from.</param>
/// <param name="content">The parsed content; may be null when no content could be parsed.</param>
public void AddResult(Uri requestUri, CrawledContent content)
{
	if (!UriCrawlStates.TryGetValue(requestUri, out var crawlState))
	{
		return;
	}

	// No content to inspect: record the crawl as-is (Content stays null).
	if (content == null)
	{
		AddResult(new CrawledUri
		{
			Location = crawlState.Location,
			Status = CrawlStatus.Crawled,
			RedirectChain = crawlState.Redirects,
			Requests = crawlState.Requests,
			Content = content
		});
		return;
	}

	var robotsPageDefinition = RobotsPageParser.FromRules(content.PageRobotRules);

	// An in-page robots rule forbids indexing: record the block and stop here.
	if (!robotsPageDefinition.CanIndex(Settings.UserAgent))
	{
		AddResult(new CrawledUri
		{
			Location = crawlState.Location,
			Status = CrawlStatus.RobotsBlocked,
			Requests = crawlState.Requests,
			RedirectChain = crawlState.Redirects
		});
		return;
	}

	// Queue discovered links only when the in-page robots rules allow following.
	if (robotsPageDefinition.CanFollowLinks(Settings.UserAgent))
	{
		foreach (var crawlLink in content.Links)
		{
			AddLink(crawlLink);
		}
	}

	AddResult(new CrawledUri
	{
		Location = crawlState.Location,
		Status = CrawlStatus.Crawled,
		RedirectChain = crawlState.Redirects,
		Requests = crawlState.Requests,
		Content = content
	});
}
/// <summary>
/// Validates and queues a request for <paramref name="requestUri"/>, enforcing the allowed
/// hosts, the page-crawl limit, the retry and redirect limits, and the robots.txt rules.
/// </summary>
/// <param name="requestUri">The absolute URI to request.</param>
/// <param name="skipMaxPageCheck">When true, bypasses the max-pages-to-crawl limit (e.g. for redirects).</param>
private void AddRequest(Uri requestUri, bool skipMaxPageCheck)
{
	// Only crawl the base host or, when configured, any of its alias hosts.
	if (Settings.HostAliases != null)
	{
		if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host)))
		{
			Logger?.LogDebug($"{requestUri.Host} is not in the list of allowed hosts.");
			return;
		}
	}
	else if (requestUri.Host != BaseUri.Host)
	{
		Logger?.LogDebug($"{requestUri.Host} doesn't match the base host.");
		return;
	}

	if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0)
	{
		var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests;
		// Fix: ">=" (was "==") so the limit still holds if the combined count ever steps
		// past the threshold rather than landing exactly on it.
		if (expectedCrawlCount >= Settings.MaxNumberOfPagesToCrawl)
		{
			Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}");
			return;
		}
	}

	SeenUris.TryAdd(requestUri, 0);

	if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
	{
		// A previous successful request means there is nothing left to do for this URI.
		var lastRequest = crawlState.Requests.LastOrDefault();
		if (lastRequest != null && lastRequest.IsSuccessfulStatus)
		{
			return;
		}

		// Fix: ">=" (was "==") so exceeding the retry count can never slip through,
		// and use the Count property instead of the LINQ Count() extension.
		if (crawlState.Requests.Count >= Settings.NumberOfRetries)
		{
			AddResult(new CrawledUri
			{
				Location = crawlState.Location,
				Status = CrawlStatus.MaxRetries,
				Requests = crawlState.Requests,
				RedirectChain = crawlState.Redirects
			});
			return;
		}

		// Fix: ">=" (was "==") for the same fail-open reason as the retry limit above.
		if (crawlState.Redirects != null && crawlState.Redirects.Count >= Settings.MaxNumberOfRedirects)
		{
			AddResult(new CrawledUri
			{
				Location = crawlState.Location,
				RedirectChain = crawlState.Redirects,
				Status = CrawlStatus.MaxRedirects
			});
			return;
		}
	}

	if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent))
	{
		Settings.RequestProcessor.Add(requestUri);
	}
	else
	{
		AddResult(new CrawledUri
		{
			Location = requestUri,
			Status = CrawlStatus.RobotsBlocked
		});
	}
}
/// <summary>
/// Processes all queued requests: requests that threw an exception and 5xx responses are
/// re-queued for retry, redirects are followed, successful (2xx) responses are handed to
/// <paramref name="responseSuccessAction"/>, and any other status is recorded as-is.
/// </summary>
/// <param name="responseSuccessAction">Callback invoked for each request that completed with a 2xx status.</param>
/// <param name="cancellationToken">Token used to cancel the crawl.</param>
/// <returns>A snapshot of all crawled URIs.</returns>
public async Task<IEnumerable<CrawledUri>> ProcessAsync(
	Func<RequestResult, UriCrawlState, Task> responseSuccessAction,
	CancellationToken cancellationToken = default
)
{
	await Settings.RequestProcessor.ProcessAsync(HttpClient, async (requestResult) =>
	{
		// Use the value-factory overload so a new UriCrawlState is only allocated
		// when the URI is not already tracked.
		var crawlState = UriCrawlStates.GetOrAdd(
			requestResult.RequestUri,
			uri => new UriCrawlState { Location = uri });

		if (requestResult.Exception != null)
		{
			// Retry failed requests
			Logger?.LogDebug($"An exception occurred while requesting {crawlState.Location}. This URL will be added to the request queue to be attempted again later.");
			crawlState.Requests.Add(new CrawlRequest
			{
				RequestStart = requestResult.RequestStart,
				ElapsedTime = requestResult.ElapsedTime
			});
			AddRequest(requestResult.RequestUri);
			return;
		}

		var crawlRequest = new CrawlRequest
		{
			RequestStart = requestResult.RequestStart,
			ElapsedTime = requestResult.ElapsedTime,
			StatusCode = requestResult.StatusCode,
			IsSuccessfulStatus = (int)requestResult.StatusCode is >= 200 and <= 299
		};
		crawlState.Requests.Add(crawlRequest);

		// Pattern match instead of allocating a lookup array on every single response.
		var isRedirect = crawlRequest.StatusCode.Value
			is HttpStatusCode.MovedPermanently
			or HttpStatusCode.Redirect
			or HttpStatusCode.TemporaryRedirect;

		if (isRedirect)
		{
			Logger?.LogDebug($"Result for {crawlState.Location} was a redirect ({requestResult.ResponseHeaders.Location}). This URL will be added to the request queue.");
			AddRedirect(crawlState.Location, requestResult.ResponseHeaders.Location);
		}
		else if (crawlRequest.IsSuccessfulStatus)
		{
			await responseSuccessAction(requestResult, crawlState);
		}
		else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
		{
			// On server errors, try to crawl the page again later.
			// Fix: this log message previously contained a raw line break inside a regular
			// interpolated string literal, which is not valid C#; it is now a single line.
			Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). This URL will be added to the request queue to be attempted again later.");
			AddRequest(crawlState.Location);
		}
		else
		{
			// On any other error, just save what we have seen and move on
			// (the content of the request is considered irrelevant).
			Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). No further requests will be attempted.");
			AddResult(new CrawledUri
			{
				Location = crawlState.Location,
				Status = CrawlStatus.Crawled,
				RedirectChain = crawlState.Redirects,
				Requests = crawlState.Requests
			});
		}
	}, Settings.RequestProcessorOptions, cancellationToken);

	Logger?.LogDebug($"Completed crawling {CrawledUris.Count} pages.");
	return CrawledUris.ToArray();
}