/// <summary>
/// Queues a discovered link for crawling, unless the link is marked "nofollow"
/// or its location (after <c>StripFragment</c>) is already present in <c>SeenUris</c>.
/// </summary>
/// <param name="crawlLink">The link to consider; its <c>Relationship</c> and <c>Location</c> are read.</param>
public void AddLink(CrawlLink crawlLink)
{
    // A null relationship is treated as "not nofollow"; the match is case-insensitive.
    var isNoFollow = crawlLink.Relationship?.Equals("nofollow", StringComparison.InvariantCultureIgnoreCase) == true;
    if (isNoFollow)
    {
        return;
    }

    // De-duplicate on the URI produced by StripFragment rather than on the raw location.
    var targetUri = StripFragment(crawlLink.Location);
    if (!SeenUris.ContainsKey(targetUri))
    {
        // Regular links are always subject to the max-page check.
        AddRequest(targetUri, false);
    }
}
/// <summary>
/// Runs a request URI through the host, page-limit, retry, redirect and robots checks and,
/// when all of them pass, adds it to <c>Settings.RequestProcessor</c>. URIs that hit the
/// retry or redirect limit, or that are blocked by the robots file, are reported through
/// <c>AddResult</c> instead of being requested.
/// </summary>
/// <param name="requestUri">The URI to request.</param>
/// <param name="skipMaxPageCheck">
/// When <c>true</c>, the <c>Settings.MaxNumberOfPagesToCrawl</c> limit is not applied to this request.
/// </param>
private void AddRequest(Uri requestUri, bool skipMaxPageCheck)
{
    // Host filter: with aliases configured, the host must be the base host or one of the
    // aliases; without aliases, it must be exactly the base host.
    if (Settings.HostAliases != null)
    {
        if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host)))
        {
            Logger?.LogDebug($"{requestUri.Host} is not in the list of allowed hosts.");
            return;
        }
    }
    else if (requestUri.Host != BaseUri.Host)
    {
        Logger?.LogDebug($"{requestUri.Host} doesn't match the base host.");
        return;
    }

    // Page limit (a value of 0 or less disables it).
    if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0)
    {
        var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests;

        // ">=" rather than "==": requests added with skipMaxPageCheck bypass this check, so the
        // count can move past the limit; an exact-equality test would then never match again
        // and the limit would silently stop being enforced.
        if (expectedCrawlCount >= Settings.MaxNumberOfPagesToCrawl)
        {
            Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}");
            return;
        }
    }

    // Mark the URI as seen only after the host and page-limit checks have passed.
    SeenUris.TryAdd(requestUri, 0);

    if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
    {
        // Nothing to do when the most recent request for this URI already succeeded.
        var lastRequest = crawlState.Requests.LastOrDefault();
        if (lastRequest != null && lastRequest.IsSuccessfulStatus)
        {
            return;
        }

        // ">=" so that a request count that has overshot the retry limit still ends the
        // retries instead of slipping past an exact-equality test.
        if (crawlState.Requests.Count() >= Settings.NumberOfRetries)
        {
            AddResult(new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.MaxRetries,
                Requests = crawlState.Requests,
                RedirectChain = crawlState.Redirects
            });
            return;
        }

        // Same reasoning for the redirect chain length.
        if (crawlState.Redirects != null && crawlState.Redirects.Count >= Settings.MaxNumberOfRedirects)
        {
            AddResult(new CrawledUri
            {
                Location = crawlState.Location,
                RedirectChain = crawlState.Redirects,
                Status = CrawlStatus.MaxRedirects
            });
            return;
        }
    }

    // The robots file has the final say: allowed URIs are queued, blocked ones are
    // recorded as a result.
    if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent))
    {
        Settings.RequestProcessor.Add(requestUri);
    }
    else
    {
        AddResult(new CrawledUri
        {
            Location = requestUri,
            Status = CrawlStatus.RobotsBlocked
        });
    }
}