示例#1
0
        /// <summary>
        /// Crawls the site that <paramref name="siteUri"/> belongs to. The crawl is seeded with every
        /// location listed in the sitemaps discovered for the URI's host.
        /// </summary>
        /// <param name="siteUri">
        /// Any URI on the target site. Its scheme and authority form the crawl base, and its host is
        /// used for sitemap discovery.
        /// </param>
        /// <param name="settings">
        /// Crawl configuration; this method reads the user agent, the request processor options and
        /// the content processor from it.
        /// </param>
        /// <returns>
        /// A <see cref="CrawlResult"/> carrying the UTC start time, the total elapsed time and the
        /// URIs returned by the crawl runner.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="siteUri"/> or <paramref name="settings"/> is <c>null</c>.
        /// </exception>
        public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
        {
            if (siteUri == null)
            {
                throw new ArgumentNullException(nameof(siteUri));
            }

            if (settings == null)
            {
                throw new ArgumentNullException(nameof(settings));
            }

            var result = new CrawlResult
            {
                CrawlStart = DateTime.UtcNow
            };
            var overallCrawlStopwatch = Stopwatch.StartNew();

            var baseUri    = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
            var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);

            UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

            var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

            //Use any links referred to by the sitemap as a starting point.
            //Distinct() runs over the flattened sequence so that a location listed in more than
            //one sitemap is only queued once (previously it only de-duplicated within a sitemap).
            var sitemaps        = await new SitemapQuery(HttpClient).GetAllSitemapsForDomainAsync(siteUri.Host);
            var urisFromSitemap = sitemaps
                                  .SelectMany(s => s.Urls.Select(u => u.Location))
                                  .Distinct();

            foreach (var uri in urisFromSitemap)
            {
                crawlRunner.AddRequest(uri);
            }

            result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
            {
                using (requestResult.Content)
                {
                    var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders);
                    var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content);

                    //The content processor has consumed the stream; rewind before capturing the raw text
                    requestResult.Content.Seek(0, SeekOrigin.Begin);
                    using (var reader = new StreamReader(requestResult.Content))
                    {
                        content.RawContent = await reader.ReadToEndAsync();
                    }

                    crawlRunner.AddResult(crawlState.Location, content);
                }
            });

            overallCrawlStopwatch.Stop();
            result.ElapsedTime = overallCrawlStopwatch.Elapsed;
            return result;
        }
示例#2
0
        /// <summary>
        /// Crawls the site that <paramref name="siteUri"/> belongs to. The crawl is seeded with every
        /// location listed in the sitemaps discovered for the URI's host, and each response is
        /// classified as a redirect, a success, a server error or another failure.
        /// </summary>
        /// <param name="siteUri">
        /// Any URI on the target site. Its scheme and authority form the crawl base, and its host is
        /// used for sitemap discovery.
        /// </param>
        /// <param name="settings">
        /// Crawl configuration; this method reads the user agent, the request processor options and
        /// the content processor from it.
        /// </param>
        /// <returns>
        /// A <see cref="CrawlResult"/> carrying the UTC start time, the total elapsed time and the
        /// URIs returned by the crawl runner.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="siteUri"/> or <paramref name="settings"/> is <c>null</c>.
        /// </exception>
        public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
        {
            if (siteUri == null)
            {
                throw new ArgumentNullException(nameof(siteUri));
            }

            if (settings == null)
            {
                throw new ArgumentNullException(nameof(settings));
            }

            var result = new CrawlResult
            {
                CrawlStart = DateTime.UtcNow
            };
            var overallCrawlStopwatch = Stopwatch.StartNew();

            var baseUri    = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
            var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);

            UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

            var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

            //Use any links referred to by the sitemap as a starting point.
            //Distinct() runs over the flattened sequence so that a location listed in more than
            //one sitemap is only queued once (previously it only de-duplicated within a sitemap).
            var sitemaps        = await new SitemapQuery(HttpClient).GetAllSitemapsForDomain(siteUri.Host);
            var urisFromSitemap = sitemaps
                                  .SelectMany(s => s.Urls.Select(u => u.Location))
                                  .Distinct();

            foreach (var uri in urisFromSitemap)
            {
                crawlRunner.AddRequest(uri);
            }

            //Allocated once for the whole crawl rather than once per response
            var redirectStatusCodes = new[]
            {
                HttpStatusCode.MovedPermanently,
                HttpStatusCode.Redirect,
                HttpStatusCode.TemporaryRedirect
            };

            result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
            {
                var response = requestResult.ResponseMessage;

                var crawlRequest = new CrawlRequest
                {
                    RequestStart       = requestResult.RequestStart,
                    ElapsedTime        = requestResult.ElapsedTime,
                    StatusCode         = response.StatusCode,
                    IsSuccessfulStatus = response.IsSuccessStatusCode
                };
                crawlState.Requests.Add(crawlRequest);

                var statusCode       = response.StatusCode;
                var redirectLocation = response.Headers.Location;

                //A redirect without a Location header cannot be followed, so it falls through to
                //the generic error handling below instead of passing null to the crawl runner.
                if (redirectStatusCodes.Contains(statusCode) && redirectLocation != null)
                {
                    crawlRunner.AddRedirect(crawlState.Location, redirectLocation);
                }
                else if (crawlRequest.IsSuccessfulStatus)
                {
                    using (var contentStream = await response.Content.ReadAsStreamAsync())
                    {
                        var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
                        var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);

                        //The content processor has consumed the stream; rewind before capturing the raw text
                        contentStream.Seek(0, SeekOrigin.Begin);
                        using (var reader = new StreamReader(contentStream))
                        {
                            content.RawContent = await reader.ReadToEndAsync();
                        }

                        crawlRunner.AddResult(crawlState.Location, content);
                    }
                }
                else if ((int)statusCode >= 500 && (int)statusCode <= 599)
                {
                    //On server errors, try to crawl the page again later
                    crawlRunner.AddRequest(crawlState.Location);
                }
                else
                {
                    //On any other error, just save what we have seen and move on
                    //Consider the content of the request irrelevant
                    crawlRunner.AddResult(crawlState.Location, null);
                }
            });

            overallCrawlStopwatch.Stop();
            result.ElapsedTime = overallCrawlStopwatch.Elapsed;
            return result;
        }
示例#3
0
        /// <summary>
        /// Crawls the site that <paramref name="siteUri"/> belongs to, starting from the site root
        /// plus every location listed in the sitemaps discovered for the URI's host, and following
        /// links found in crawled content.
        /// </summary>
        /// <param name="siteUri">
        /// Any URI on the target site. Its scheme and authority form the crawl base, and its host is
        /// used for sitemap discovery.
        /// </param>
        /// <param name="settings">
        /// Crawl configuration. Note that <c>TaskHandlerOptions.DelayBetweenTaskStart</c> on this
        /// instance is overwritten with the larger of its current value and the robots.txt crawl-delay.
        /// </param>
        /// <returns>
        /// A <see cref="CrawlResult"/> carrying the UTC start time, the total elapsed time and the
        /// URIs recorded in the crawl context.
        /// </returns>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="siteUri"/> or <paramref name="settings"/> is <c>null</c>.
        /// </exception>
        public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
        {
            if (siteUri == null)
            {
                throw new ArgumentNullException(nameof(siteUri));
            }

            if (settings == null)
            {
                throw new ArgumentNullException(nameof(settings));
            }

            var result = new CrawlResult
            {
                CrawlStart = DateTime.UtcNow
            };
            var stopwatch = Stopwatch.StartNew();

            var baseUri    = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
            var robotsFile = await new RobotsParser(HttpClient).FromUriAsync(baseUri);

            //Apply Robots.txt crawl-delay (if defined); the delay is expressed in seconds there
            var userAgentEntry    = robotsFile.GetEntryForUserAgent(settings.UserAgent);
            var minimumCrawlDelay = userAgentEntry?.CrawlDelay ?? 0;
            var taskDelay         = Math.Max(minimumCrawlDelay * 1000, settings.TaskHandlerOptions.DelayBetweenTaskStart.TotalMilliseconds);

            settings.TaskHandlerOptions.DelayBetweenTaskStart = new TimeSpan(0, 0, 0, 0, (int)taskDelay);

            //Seed with the site root plus any links referred to by the sitemap.
            //De-duplication is done on the Uri values *before* wrapping them: calling Distinct() on
            //freshly created UriCrawlState objects only helps if that type defines value equality,
            //which is not visible from here.
            var sitemaps = await new SitemapQuery(HttpClient).GetAllSitemapsForDomain(siteUri.Host);
            var seedUris = new[] { baseUri }
                           .Concat(sitemaps.SelectMany(s => s.Urls.Select(u => u.Location)))
                           .Distinct()
                           .Select(location => new UriCrawlState
                           {
                               Location = location
                           })
                           .ToArray();

            var crawlContext = new CrawlContext
            {
                Settings = settings
            };

            await TaskHandler.For(seedUris, async (crawlState, pagesToCrawl) =>
            {
                if (!CheckUriValidity(crawlState.Location, baseUri, crawlContext))
                {
                    return;
                }

                //Already has a final result recorded
                if (crawlContext.CrawledUris.ContainsKey(crawlState.Location))
                {
                    return;
                }

                crawlContext.SeenUris.TryAdd(crawlState.Location, 0);

                var lastRequest = crawlState.Requests.LastOrDefault();
                if (lastRequest != null && lastRequest.IsSuccessfulStatus)
                {
                    return;
                }
                else if (crawlState.Requests.Count() == settings.NumberOfRetries)
                {
                    crawlContext.CrawledUris.TryAdd(crawlState.Location, new CrawledUri
                    {
                        Location      = crawlState.Location,
                        Status        = CrawlStatus.MaxRetries,
                        Requests      = crawlState.Requests,
                        RedirectChain = crawlState.Redirects
                    });
                }
                else if (robotsFile.IsAllowedAccess(crawlState.Location, settings.UserAgent))
                {
                    var crawledUri = await PerformRequest(crawlState, pagesToCrawl, crawlContext);
                    if (crawledUri != null)
                    {
                        crawlContext.CrawledUris.TryAdd(crawlState.Location, crawledUri);

                        if (crawledUri.Content?.Links?.Any() == true)
                        {
                            foreach (var crawlLink in crawledUri.Content.Links)
                            {
                                if (!CheckUriValidity(crawlLink.Location, baseUri, crawlContext))
                                {
                                    continue;
                                }

                                //TryAdd is the single source of truth for "first time seen": a separate
                                //ContainsKey check followed by TryAdd lets two concurrent handlers both
                                //pass the check and enqueue the same link.
                                if (!crawlContext.SeenUris.TryAdd(crawlLink.Location, 0))
                                {
                                    continue;
                                }

                                pagesToCrawl.Enqueue(new UriCrawlState
                                {
                                    Location = crawlLink.Location
                                });
                            }
                        }
                    }
                }
                else
                {
                    crawlContext.CrawledUris.TryAdd(crawlState.Location, new CrawledUri
                    {
                        Location = crawlState.Location,
                        Status   = CrawlStatus.RobotsBlocked
                    });
                }
            }, settings.TaskHandlerOptions);

            stopwatch.Stop();
            result.ElapsedTime = stopwatch.Elapsed;
            result.CrawledUris = crawlContext.CrawledUris.Values;
            return result;
        }