예제 #1
0
        /// <summary>
        /// Queues the target of a discovered link for crawling, unless the link is marked
        /// <c>nofollow</c> or its fragment-less URI is already present in <c>SeenUris</c>.
        /// </summary>
        /// <param name="crawlLink">
        /// The link to consider. Its <c>Relationship</c> is checked for a <c>nofollow</c> token
        /// and its <c>Location</c> (with any fragment stripped) is what gets requested.
        /// </param>
        public void AddLink(CrawlLink crawlLink)
        {
            if (crawlLink.Relationship != null)
            {
                // The HTML "rel" attribute is a set of whitespace-separated tokens, so values
                // such as "nofollow noopener" must be honoured as well as a bare "nofollow".
                // NOTE(review): assumes Relationship carries the raw rel attribute value - confirm.
                var relationshipTokens = crawlLink.Relationship.Split(
                    new[] { ' ', '\t', '\n', '\f', '\r' },
                    StringSplitOptions.RemoveEmptyEntries);

                foreach (var relationshipToken in relationshipTokens)
                {
                    // "nofollow" is a protocol keyword, not linguistic text: compare ordinally.
                    if (relationshipToken.Equals("nofollow", StringComparison.OrdinalIgnoreCase))
                    {
                        return;
                    }
                }
            }

            // Fragments are stripped before the seen-check and the request, so links that
            // differ only by "#fragment" map to the same URI.
            var uriWithoutFragment = StripFragment(crawlLink.Location);

            if (SeenUris.ContainsKey(uriWithoutFragment))
            {
                return;
            }

            AddRequest(uriWithoutFragment, false);
        }
예제 #2
0
        /// <summary>
        /// Validates a URI against the crawl rules and either hands it to the request processor,
        /// records a terminal result for it, or silently drops it.
        /// </summary>
        /// <remarks>
        /// Checks are applied in this order: allowed host, page crawl limit, previous crawl state
        /// (already successful, retry limit, redirect limit) and finally robots access.
        /// The URI is recorded in <c>SeenUris</c> once it passes the host and page-limit checks.
        /// </remarks>
        /// <param name="requestUri">The URI to request.</param>
        /// <param name="skipMaxPageCheck">
        /// When <c>true</c>, the <c>Settings.MaxNumberOfPagesToCrawl</c> limit is not enforced for this request.
        /// </param>
        private void AddRequest(Uri requestUri, bool skipMaxPageCheck)
        {
            // Host filtering: with aliases configured the host must be the base host or an alias;
            // without aliases it must be exactly the base host.
            if (Settings.HostAliases != null)
            {
                if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host)))
                {
                    Logger?.LogDebug($"{requestUri.Host} is not in the list of allowed hosts.");
                    return;
                }
            }
            else if (requestUri.Host != BaseUri.Host)
            {
                Logger?.LogDebug($"{requestUri.Host} doesn't match the base host.");
                return;
            }

            // A limit of zero (or less) disables the page-count check entirely.
            if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0)
            {
                var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests;

                // ">=" rather than "==": requests added with skipMaxPageCheck bypass this check and
                // can push the count past the limit, after which an equality test would never
                // match again and the limit would stop being enforced.
                if (expectedCrawlCount >= Settings.MaxNumberOfPagesToCrawl)
                {
                    Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}");
                    return;
                }
            }

            SeenUris.TryAdd(requestUri, 0);

            if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
            {
                // Nothing to do when the most recent request for this URI already succeeded.
                var lastRequest = crawlState.Requests.LastOrDefault();
                if (lastRequest != null && lastRequest.IsSuccessfulStatus)
                {
                    return;
                }

                // ">=" so that overshooting the retry limit still ends in a MaxRetries result
                // instead of retrying indefinitely.
                if (crawlState.Requests.Count() >= Settings.NumberOfRetries)
                {
                    AddResult(new CrawledUri
                    {
                        Location      = crawlState.Location,
                        Status        = CrawlStatus.MaxRetries,
                        Requests      = crawlState.Requests,
                        RedirectChain = crawlState.Redirects
                    });
                    return;
                }

                // Same reasoning as above for the redirect limit.
                if (crawlState.Redirects != null && crawlState.Redirects.Count >= Settings.MaxNumberOfRedirects)
                {
                    AddResult(new CrawledUri
                    {
                        Location      = crawlState.Location,
                        RedirectChain = crawlState.Redirects,
                        Status        = CrawlStatus.MaxRedirects
                    });
                    return;
                }
            }

            if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent))
            {
                Settings.RequestProcessor.Add(requestUri);
            }
            else
            {
                // Disallowed by robots rules: record the outcome instead of requesting the URI.
                AddResult(new CrawledUri
                {
                    Location = requestUri,
                    Status   = CrawlStatus.RobotsBlocked
                });
            }
        }