Example #1
0
        /// <summary>
        /// Crawls the site containing <paramref name="siteUri"/>, seeding the request queue from the
        /// site's sitemap(s) and honouring the crawl delay declared in robots.txt.
        /// </summary>
        /// <param name="siteUri">Any URI on the target site; only its authority is used as the crawl base.</param>
        /// <param name="settings">Crawl configuration (user agent, content processor, request processor options).</param>
        /// <returns>A <see cref="CrawlResult"/> with the crawl start time, all crawled URIs and total elapsed time.</returns>
        public async Task <CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
        {
            var result = new CrawlResult
            {
                CrawlStart = DateTime.UtcNow
            };
            var overallCrawlStopwatch = Stopwatch.StartNew();

            var baseUri    = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
            var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);

            UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

            var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

            //Use any links referred to by the sitemap as a starting point
            var urisFromSitemap = (await new SitemapQuery(HttpClient)
                                   .GetAllSitemapsForDomain(siteUri.Host))
                                  .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());

            foreach (var uri in urisFromSitemap)
            {
                crawlRunner.AddRequest(uri);
            }

            //Hoisted out of the per-response callback so the array is allocated once per crawl
            //rather than once per crawled page.
            var redirectStatusCodes = new[]
            {
                HttpStatusCode.MovedPermanently,
                HttpStatusCode.Redirect,
                HttpStatusCode.TemporaryRedirect
            };

            result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
            {
                var response = requestResult.ResponseMessage;

                var crawlRequest = new CrawlRequest
                {
                    RequestStart       = requestResult.RequestStart,
                    ElapsedTime        = requestResult.ElapsedTime,
                    StatusCode         = response.StatusCode,
                    IsSuccessfulStatus = response.IsSuccessStatusCode
                };
                crawlState.Requests.Add(crawlRequest);

                if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
                {
                    crawlRunner.AddRedirect(crawlState.Location, response.Headers.Location);
                }
                else if (crawlRequest.IsSuccessfulStatus)
                {
                    using (var contentStream = await response.Content.ReadAsStreamAsync())
                    using (var reader = new StreamReader(contentStream))
                    {
                        var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
                        var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);
                        //Rewind so the raw content can be re-read after the processor consumed the stream.
                        contentStream.Seek(0, SeekOrigin.Begin);
                        content.RawContent = await reader.ReadToEndAsync();
                        crawlRunner.AddResult(crawlState.Location, content);
                    }
                }
                else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
                {
                    //On server errors, try to crawl the page again later
                    crawlRunner.AddRequest(crawlState.Location);
                }
                else
                {
                    //On any other error, just save what we have seen and move on
                    //Consider the content of the request irrelevant
                    crawlRunner.AddResult(crawlState.Location, null);
                }
            });

            overallCrawlStopwatch.Stop();
            result.ElapsedTime = overallCrawlStopwatch.Elapsed;
            return result;
        }
Example #2
0
        /// <summary>
        /// Performs a single HTTP GET for <paramref name="crawlState"/> and classifies the outcome:
        /// redirects (301/302/307) are resolved and re-queued, server errors (5xx) are re-queued for
        /// retry, and everything else produces a final <see cref="CrawledUri"/>.
        /// </summary>
        /// <param name="crawlState">The URI being crawled, with its accumulated requests and redirect chain.</param>
        /// <param name="pagesToCrawl">Queue receiving follow-up work (redirect targets, 5xx retries).</param>
        /// <param name="context">Shared crawl context holding the settings and the seen-URI set.</param>
        /// <returns>A completed <see cref="CrawledUri"/>, or <c>null</c> when the URI was re-queued instead.</returns>
        private async Task <CrawledUri> PerformRequest(UriCrawlState crawlState, ConcurrentQueue <UriCrawlState> pagesToCrawl, CrawlContext context)
        {
            var crawlRequest = new CrawlRequest
            {
                RequestStart = DateTime.UtcNow,
            };

            var stopwatch = Stopwatch.StartNew();

            using (var response = await HttpClient.GetAsync(crawlState.Location))
            {
                crawlRequest.StatusCode         = response.StatusCode;
                crawlRequest.IsSuccessfulStatus = response.IsSuccessStatusCode;

                //Buffer the body before stopping the clock so ElapsedTime covers the full transfer.
                await response.Content.LoadIntoBufferAsync();

                stopwatch.Stop();
                crawlRequest.ElapsedTime = stopwatch.Elapsed;

                crawlState.Requests.Add(crawlRequest);

                var redirectStatusCodes = new[]
                {
                    HttpStatusCode.MovedPermanently,
                    HttpStatusCode.Redirect,
                    HttpStatusCode.TemporaryRedirect
                };

                if (redirectStatusCodes.Contains(crawlRequest.StatusCode))
                {
                    var headerLocation     = response.Headers.Location;
                    var redirectCrawlState = new UriCrawlState
                    {
                        //Resolve the Location header against the current URI: it may be relative.
                        Location  = new Uri(crawlState.Location, headerLocation.ToString()),
                        Redirects = crawlState.Redirects ?? new List <CrawledUriRedirect>()
                    };

                    redirectCrawlState.Redirects.Add(new CrawledUriRedirect
                    {
                        Location = crawlState.Location,
                        Requests = crawlState.Requests
                    });

                    pagesToCrawl.Enqueue(redirectCrawlState);
                    //Record the resolved absolute target, not the raw header value: a relative
                    //Location header would never match the absolute URIs queued for crawling.
                    context.SeenUris.TryAdd(redirectCrawlState.Location, 0);
                    return null;
                }
                else if (crawlRequest.IsSuccessfulStatus)
                {
                    return new CrawledUri
                    {
                        Location = crawlState.Location,
                        Status = CrawlStatus.Crawled,
                        RedirectChain = crawlState.Redirects,
                        Requests = crawlState.Requests,
                        Content = await context.Settings.ContentParser.Parse(crawlState.Location, response, context.Settings)
                    };
                }
                else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
                {
                    //On server errors, try to crawl the page again later
                    pagesToCrawl.Enqueue(crawlState);
                    return null;
                }
                else
                {
                    //On any other error, just save what we have seen and move on
                    //Consider the content of the request irrelevant
                    return new CrawledUri
                    {
                        Location = crawlState.Location,
                        Status = CrawlStatus.Crawled,
                        RedirectChain = crawlState.Redirects,
                        Requests = crawlState.Requests
                    };
                }
            }
        }