public void AddResult(Uri requestUri, CrawledContent content)
{
    if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
    {
        var robotsPageDefinition = RobotsPageParser.FromRules(content.PageRobotRules);

        if (!robotsPageDefinition.CanIndex(Settings.UserAgent))
        {
            Logger?.LogDebug($"Result content for {requestUri} has been blocked by an in-page Robots rule.");
            AddResult(new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.RobotsBlocked,
                Requests = crawlState.Requests,
                RedirectChain = crawlState.Redirects
            });
        }
        else
        {
            Logger?.LogDebug($"Result for {requestUri} has completed successfully with content.");
            AddResult(new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.Crawled,
                RedirectChain = crawlState.Redirects,
                Requests = crawlState.Requests,
                Content = content
            });

            if (robotsPageDefinition.CanFollowLinks(Settings.UserAgent))
            {
                foreach (var crawlLink in content.Links)
                {
                    AddLink(crawlLink);
                }
            }
        }
    }
}
public void AddResult(Uri requestUri, CrawledContent content)
{
    if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
    {
        // A null content is still recorded as a crawled result below,
        // but the in-page robots checks are skipped.
        if (content != null)
        {
            var robotsPageDefinition = RobotsPageParser.FromRules(content.PageRobotRules);

            // If indexing is disallowed by the in-page rules, record the result
            // as robots-blocked and stop.
            if (!robotsPageDefinition.CanIndex(Settings.UserAgent))
            {
                AddResult(new CrawledUri
                {
                    Location = crawlState.Location,
                    Status = CrawlStatus.RobotsBlocked,
                    Requests = crawlState.Requests,
                    RedirectChain = crawlState.Redirects
                });
                return;
            }

            // Only queue discovered links when following links is allowed.
            if (robotsPageDefinition.CanFollowLinks(Settings.UserAgent))
            {
                foreach (var crawlLink in content.Links)
                {
                    AddLink(crawlLink);
                }
            }
        }

        AddResult(new CrawledUri
        {
            Location = crawlState.Location,
            Status = CrawlStatus.Crawled,
            RedirectChain = crawlState.Redirects,
            Requests = crawlState.Requests,
            Content = content
        });
    }
}
private void AddRequest(Uri requestUri, bool skipMaxPageCheck)
{
    // Only crawl the base host or an explicitly allowed host alias.
    if (Settings.HostAliases != null)
    {
        if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host)))
        {
            Logger?.LogDebug($"{requestUri.Host} is not in the list of allowed hosts.");
            return;
        }
    }
    else if (requestUri.Host != BaseUri.Host)
    {
        Logger?.LogDebug($"{requestUri.Host} doesn't match the base host.");
        return;
    }

    if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0)
    {
        var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests;
        if (expectedCrawlCount == Settings.MaxNumberOfPagesToCrawl)
        {
            Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}");
            return;
        }
    }

    SeenUris.TryAdd(requestUri, 0);

    if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
    {
        // Don't re-request a URI whose last request already succeeded.
        var lastRequest = crawlState.Requests.LastOrDefault();
        if (lastRequest != null && lastRequest.IsSuccessfulStatus)
        {
            return;
        }

        // Give up once the retry limit is reached.
        if (crawlState.Requests.Count() == Settings.NumberOfRetries)
        {
            AddResult(new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.MaxRetries,
                Requests = crawlState.Requests,
                RedirectChain = crawlState.Redirects
            });
            return;
        }

        // Give up once the redirect limit is reached.
        if (crawlState.Redirects != null && crawlState.Redirects.Count == Settings.MaxNumberOfRedirects)
        {
            AddResult(new CrawledUri
            {
                Location = crawlState.Location,
                RedirectChain = crawlState.Redirects,
                Status = CrawlStatus.MaxRedirects
            });
            return;
        }
    }

    // Queue the request unless robots.txt blocks it for the configured user agent.
    if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent))
    {
        Settings.RequestProcessor.Add(requestUri);
    }
    else
    {
        AddResult(new CrawledUri
        {
            Location = requestUri,
            Status = CrawlStatus.RobotsBlocked
        });
    }
}
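// A minimal sketch of the data shapes the methods above read and produce, inferred
// only from the property accesses they make. The type names CrawledUri, CrawlStatus
// and CrawledContent appear in the code; the member types, the element types of the
// collections, and the names CrawlRequest and CrawledUriRedirect are assumptions for
// illustration and may not match the real definitions.
using System;
using System.Collections.Generic;

public enum CrawlStatus
{
    Crawled,        // completed, possibly with content attached
    RobotsBlocked,  // blocked by robots.txt or an in-page robots rule
    MaxRetries,     // abandoned after the configured number of retries
    MaxRedirects    // abandoned after the configured number of redirects
}

// Hypothetical element type for crawlState.Requests; only IsSuccessfulStatus is used above.
public class CrawlRequest
{
    public bool IsSuccessfulStatus { get; set; }
}

// Hypothetical element type for crawlState.Redirects / RedirectChain; not named above.
public class CrawledUriRedirect
{
    public Uri Location { get; set; }
}

// Per-URI bookkeeping held in UriCrawlStates while a crawl is in flight.
public class UriCrawlState
{
    public Uri Location { get; set; }
    public List<CrawlRequest> Requests { get; set; } = new List<CrawlRequest>();
    public List<CrawledUriRedirect> Redirects { get; set; }
}

// The finished record passed to AddResult(CrawledUri) once a URI is resolved.
public class CrawledUri
{
    public Uri Location { get; set; }
    public CrawlStatus Status { get; set; }
    public IList<CrawlRequest> Requests { get; set; }
    public IList<CrawledUriRedirect> RedirectChain { get; set; }
    public CrawledContent Content { get; set; }
}

// Only the members exercised above; the link type is assumed to be Uri here,
// though the real type may be a richer link object.
public class CrawledContent
{
    public IEnumerable<string> PageRobotRules { get; set; }
    public IEnumerable<Uri> Links { get; set; }
}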