Esempio n. 1
0
        /// <summary>
        /// Runs the configured request processor and handles each request result it reports.
        /// Results that carry no response message are recorded on the crawl state and the
        /// URI is queued again; every other result is handed to <paramref name="responseAction"/>.
        /// </summary>
        /// <param name="responseAction">Callback awaited with the request result and its crawl state when a response message is present.</param>
        /// <param name="cancellationToken">Token forwarded to the request processor.</param>
        /// <returns>An array copy of <c>CrawledUris</c> taken once processing has finished.</returns>
        public async Task<IEnumerable<CrawledUri>> ProcessAsync(
            Func<RequestResult, UriCrawlState, Task> responseAction,
            CancellationToken cancellationToken = default)
        {
            await Settings.RequestProcessor.ProcessAsync(
                HttpClient,
                async result =>
                {
                    // Look up the crawl state for this URI, creating one on first sight.
                    var state = UriCrawlStates.GetOrAdd(result.RequestUri, new UriCrawlState
                    {
                        Location = result.RequestUri
                    });

                    if (result.ResponseMessage != null)
                    {
                        await responseAction(result, state);
                        return;
                    }

                    // No response message: remember the failed attempt and retry the URI.
                    var failedAttempt = new CrawlRequest
                    {
                        RequestStart = result.RequestStart,
                        ElapsedTime = result.ElapsedTime
                    };
                    state.Requests.Add(failedAttempt);
                    AddRequest(result.RequestUri);
                },
                Settings.RequestProcessorOptions,
                cancellationToken);

            var crawled = CrawledUris.ToArray();
            return crawled;
        }
Esempio n. 2
0
        /// <summary>
        /// Replaces the crawl state of <paramref name="requestUri"/> with a new state for the
        /// redirect target and queues the target for crawling. Does nothing when no crawl
        /// state is registered for <paramref name="requestUri"/>.
        /// </summary>
        /// <param name="requestUri">The URI whose response was a redirect; also the base for resolving <paramref name="redirectUri"/>.</param>
        /// <param name="redirectUri">The redirect target, resolved against <paramref name="requestUri"/>.</param>
        public void AddRedirect(Uri requestUri, Uri redirectUri)
        {
            if (!UriCrawlStates.TryRemove(requestUri, out var previousState))
            {
                return;
            }

            // Resolve the target against the request URI, then strip its fragment.
            Uri targetUri = StripFragment(new Uri(requestUri, redirectUri));

            // The new state continues the existing redirect chain (or starts one).
            var targetState = new UriCrawlState
            {
                Location = targetUri,
                Redirects = previousState.Redirects ?? new List<CrawledUriRedirect>()
            };

            var hop = new CrawledUriRedirect
            {
                Location = previousState.Location,
                Requests = previousState.Requests
            };
            targetState.Redirects.Add(hop);

            UriCrawlStates.TryAdd(targetState.Location, targetState);

            // "true" skips the max page check for the redirect target.
            AddRequest(targetState.Location, true);
        }
Esempio n. 3
0
        /// <summary>
        /// Records the crawl result for <paramref name="requestUri"/> using the supplied content.
        /// When the in-page robots rules forbid indexing, the URI is recorded as
        /// <c>RobotsBlocked</c>; otherwise it is recorded as <c>Crawled</c> and, if the rules
        /// allow following links, each link in the content is passed to <c>AddLink</c>.
        /// Does nothing when no crawl state is registered for the URI.
        /// </summary>
        /// <param name="requestUri">The URI the content was retrieved from.</param>
        /// <param name="content">The crawled content, including its page robots rules and links.</param>
        public void AddResult(Uri requestUri, CrawledContent content)
        {
            if (!UriCrawlStates.TryGetValue(requestUri, out var state))
            {
                return;
            }

            var pageRules = RobotsPageParser.FromRules(content.PageRobotRules);

            if (pageRules.CanIndex(Settings.UserAgent))
            {
                Logger?.LogDebug($"Result for {requestUri} has completed successfully with content.");

                var crawled = new CrawledUri
                {
                    Location = state.Location,
                    Status = CrawlStatus.Crawled,
                    RedirectChain = state.Redirects,
                    Requests = state.Requests,
                    Content = content
                };
                AddResult(crawled);

                if (!pageRules.CanFollowLinks(Settings.UserAgent))
                {
                    return;
                }

                foreach (var link in content.Links)
                {
                    AddLink(link);
                }

                return;
            }

            // Indexing is disallowed by the page itself: keep the request history, drop the content.
            Logger?.LogDebug($"Result content for {requestUri} has been blocked by an in-page Robots rule.");

            var blocked = new CrawledUri
            {
                Location = state.Location,
                Status = CrawlStatus.RobotsBlocked,
                Requests = state.Requests,
                RedirectChain = state.Redirects
            };
            AddResult(blocked);
        }
Esempio n. 4
0
        /// <summary>
        /// Records the crawl result for <paramref name="requestUri"/>. With non-null content the
        /// in-page robots rules are applied first: a page that may not be indexed is recorded as
        /// <c>RobotsBlocked</c>, and links are passed to <c>AddLink</c> only when following is allowed.
        /// In every other case the URI is recorded as <c>Crawled</c> with the given content
        /// (which may be <c>null</c>). Does nothing when no crawl state is registered for the URI.
        /// </summary>
        /// <param name="requestUri">The URI the result belongs to.</param>
        /// <param name="content">The crawled content, or <c>null</c> when there is none.</param>
        public void AddResult(Uri requestUri, CrawledContent content)
        {
            if (!UriCrawlStates.TryGetValue(requestUri, out var state))
            {
                return;
            }

            if (content != null)
            {
                var pageRules = RobotsPageParser.FromRules(content.PageRobotRules);

                if (pageRules.CanIndex(Settings.UserAgent))
                {
                    // Links are queued before the page itself is recorded below.
                    if (pageRules.CanFollowLinks(Settings.UserAgent))
                    {
                        foreach (var link in content.Links)
                        {
                            AddLink(link);
                        }
                    }
                }
                else
                {
                    var blocked = new CrawledUri
                    {
                        Location = state.Location,
                        Status = CrawlStatus.RobotsBlocked,
                        Requests = state.Requests,
                        RedirectChain = state.Redirects
                    };
                    AddResult(blocked);
                    return;
                }
            }

            var crawled = new CrawledUri
            {
                Location = state.Location,
                Status = CrawlStatus.Crawled,
                RedirectChain = state.Redirects,
                Requests = state.Requests,
                Content = content
            };
            AddResult(crawled);
        }
Esempio n. 5
0
        /// <summary>
        /// Queues <paramref name="requestUri"/> with the request processor, or records a terminal
        /// result (or silently drops the URI) when a host, limit or robots check prevents it.
        /// </summary>
        /// <remarks>
        /// Checks run in this order: allowed host, page limit (unless skipped), existing crawl
        /// state (last request successful, retry limit, redirect limit), then the robots file.
        /// All limits are compared with <c>&gt;=</c> rather than <c>==</c>: the page count can step
        /// past the maximum (redirects skip the page check), and a limit of zero would otherwise
        /// never be matched once the first request or redirect has been recorded.
        /// </remarks>
        /// <param name="requestUri">The URI to queue.</param>
        /// <param name="skipMaxPageCheck">When <c>true</c>, the maximum-page check is bypassed.</param>
        private void AddRequest(Uri requestUri, bool skipMaxPageCheck)
        {
            if (Settings.HostAliases != null)
            {
                if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host)))
                {
                    Logger?.LogDebug($"{requestUri.Host} is not in the list of allowed hosts.");
                    return;
                }
            }
            else if (requestUri.Host != BaseUri.Host)
            {
                Logger?.LogDebug($"{requestUri.Host} doesn't match the base host.");
                return;
            }

            // A maximum of zero (or less) means the page limit is not applied.
            if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0)
            {
                var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests;

                // ">=" keeps the limit effective even after the count has overshot it.
                if (expectedCrawlCount >= Settings.MaxNumberOfPagesToCrawl)
                {
                    Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}");
                    return;
                }
            }

            SeenUris.TryAdd(requestUri, 0);

            if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
            {
                // Nothing to do when the most recent request already succeeded.
                var lastRequest = crawlState.Requests.LastOrDefault();
                if (lastRequest != null && lastRequest.IsSuccessfulStatus)
                {
                    return;
                }

                // Give up once the number of recorded requests reaches the retry limit.
                if (crawlState.Requests.Count() >= Settings.NumberOfRetries)
                {
                    AddResult(new CrawledUri
                    {
                        Location      = crawlState.Location,
                        Status        = CrawlStatus.MaxRetries,
                        Requests      = crawlState.Requests,
                        RedirectChain = crawlState.Redirects
                    });
                    return;
                }

                // Give up once the redirect chain reaches the redirect limit.
                if (crawlState.Redirects != null && crawlState.Redirects.Count >= Settings.MaxNumberOfRedirects)
                {
                    AddResult(new CrawledUri
                    {
                        Location      = crawlState.Location,
                        RedirectChain = crawlState.Redirects,
                        Status        = CrawlStatus.MaxRedirects
                    });
                    return;
                }
            }

            if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent))
            {
                Settings.RequestProcessor.Add(requestUri);
            }
            else
            {
                // The robots file forbids this URI for our user agent.
                AddResult(new CrawledUri
                {
                    Location = requestUri,
                    Status   = CrawlStatus.RobotsBlocked
                });
            }
        }
Esempio n. 6
0
        /// <summary>
        /// Runs the configured request processor and handles each request result it reports.
        /// </summary>
        /// <remarks>
        /// Per result: an exception records the attempt and queues the URI again; a 301/302/307
        /// response with a <c>Location</c> header is passed to <c>AddRedirect</c>; a 2xx response is
        /// handed to <paramref name="responseSuccessAction"/>; a 5xx response queues the URI again;
        /// anything else (including a redirect status without a <c>Location</c> header) is recorded
        /// as a finished result without content.
        /// </remarks>
        /// <param name="responseSuccessAction">Callback awaited with the request result and its crawl state for 2xx responses.</param>
        /// <param name="cancellationToken">Token forwarded to the request processor.</param>
        /// <returns>An array copy of <c>CrawledUris</c> taken once processing has finished.</returns>
        public async Task <IEnumerable <CrawledUri> > ProcessAsync(
            Func <RequestResult, UriCrawlState, Task> responseSuccessAction,
            CancellationToken cancellationToken = default
            )
        {
            await Settings.RequestProcessor.ProcessAsync(
                HttpClient,
                async (requestResult) =>
            {
                var crawlState = UriCrawlStates.GetOrAdd(requestResult.RequestUri, new UriCrawlState
                {
                    Location = requestResult.RequestUri
                });

                if (requestResult.Exception != null)
                {
                    //Retry failed requests
                    Logger?.LogDebug($"An exception occurred while requesting {crawlState.Location}. This URL will be added to the request queue to be attempted again later.");
                    crawlState.Requests.Add(new CrawlRequest
                    {
                        RequestStart = requestResult.RequestStart,
                        ElapsedTime  = requestResult.ElapsedTime
                    });
                    AddRequest(requestResult.RequestUri);
                }
                else
                {
                    var crawlRequest = new CrawlRequest
                    {
                        RequestStart       = requestResult.RequestStart,
                        ElapsedTime        = requestResult.ElapsedTime,
                        StatusCode         = requestResult.StatusCode,
                        IsSuccessfulStatus = (int)requestResult.StatusCode is >= 200 and <= 299
                    };
                    crawlState.Requests.Add(crawlRequest);

                    //Constant pattern instead of allocating a lookup array for every response
                    var isRedirectStatus = crawlRequest.StatusCode
                        is HttpStatusCode.MovedPermanently
                        or HttpStatusCode.Redirect
                        or HttpStatusCode.TemporaryRedirect;

                    //A redirect without a Location header has no target to follow; passing null on
                    //to AddRedirect would fail while resolving the URI, so treat it as a final result
                    var redirectLocation = requestResult.ResponseHeaders?.Location;

                    if (isRedirectStatus && redirectLocation != null)
                    {
                        Logger?.LogDebug($"Result for {crawlState.Location} was a redirect ({redirectLocation}). This URL will be added to the request queue.");
                        AddRedirect(crawlState.Location, redirectLocation);
                    }
                    else if (crawlRequest.IsSuccessfulStatus)
                    {
                        await responseSuccessAction(requestResult, crawlState);
                    }
                    else if ((int)crawlRequest.StatusCode is >= 500 and <= 599)
                    {
                        //On server errors, try to crawl the page again later
                        Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). This URL will be added to the request queue to be attempted again later.");
                        AddRequest(crawlState.Location);
                    }
                    else
                    {
                        //On any other error, just save what we have seen and move on
                        //Consider the content of the request irrelevant
                        Logger?.LogDebug($"Result for {crawlState.Location} was unexpected ({crawlRequest.StatusCode}). No further requests will be attempted.");
                        AddResult(new CrawledUri
                        {
                            Location      = crawlState.Location,
                            Status        = CrawlStatus.Crawled,
                            RedirectChain = crawlState.Redirects,
                            Requests      = crawlState.Requests
                        });
                    }
                }
            },
                Settings.RequestProcessorOptions,
                cancellationToken
                );

            Logger?.LogDebug($"Completed crawling {CrawledUris.Count} pages.");

            return(CrawledUris.ToArray());
        }