Exemple #1
0
        /// <summary>
        /// Requests <paramref name="requestUri"/> using the HTTP client that
        /// <c>TestSiteConfiguration</c> provides for <paramref name="siteContext"/>, buffers the
        /// response body and hands it to <paramref name="contentProcessor"/> for parsing.
        /// </summary>
        /// <param name="siteContext">Site context used to obtain the HTTP client.</param>
        /// <param name="requestUri">Address to request; also passed through to the processor.</param>
        /// <param name="contentProcessor">Processor that receives the URI, headers and body stream.</param>
        /// <returns>Whatever <c>contentProcessor.Parse</c> returns for the response.</returns>
        protected async Task<CrawledContent> RequestAndProcessContentAsync(SiteContext siteContext, Uri requestUri, IContentProcessor contentProcessor)
        {
            var client = TestSiteConfiguration.GetHttpClient(siteContext);

            using (var responseMessage = await client.GetAsync(requestUri))
            {
                var body = responseMessage.Content;

                // Buffer the whole body before a stream over it is requested.
                await body.LoadIntoBufferAsync();

                using (var bodyStream = await body.ReadAsStreamAsync())
                {
                    var crawlHeaders = new CrawlHeaders(responseMessage.Headers, body.Headers);
                    var parsed = contentProcessor.Parse(requestUri, crawlHeaders, bodyStream);
                    return parsed;
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Crawls the site identified by <paramref name="siteUri"/>: loads the robots file for the
        /// site's base address, seeds the crawl with every location listed in the domain's sitemaps,
        /// then parses each processed response with <c>settings.ContentProcessor</c>.
        /// </summary>
        /// <param name="siteUri">Any URI on the target site; only its scheme/authority and host are used.</param>
        /// <param name="settings">Crawl settings (user agent, request processor options, content processor).</param>
        /// <returns>
        /// A <c>CrawlResult</c> with <c>CrawlStart</c>, <c>CrawledUris</c> and <c>ElapsedTime</c> populated.
        /// </returns>
        public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
        {
            var result = new CrawlResult
            {
                CrawlStart = DateTime.UtcNow
            };
            var overallCrawlStopwatch = Stopwatch.StartNew();

            // Reduce the supplied URI to scheme + authority (e.g. "https://host:port").
            var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
            var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);

            UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

            var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

            //Use any links referred to by the sitemap as a starting point
            var sitemaps = await new SitemapQuery(HttpClient).GetAllSitemapsForDomainAsync(siteUri.Host);

            // De-duplicate across ALL sitemaps, not per sitemap, so a location that appears in
            // several sitemaps is only queued once.
            var urisFromSitemap = sitemaps
                .SelectMany(s => s.Urls.Select(u => u.Location))
                .Distinct();

            foreach (var uri in urisFromSitemap)
            {
                crawlRunner.AddRequest(uri);
            }

            result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
            {
                using (requestResult.Content)
                {
                    var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders);
                    var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content);

                    // Parse may have advanced the stream; rewind so the full raw text can be captured too.
                    requestResult.Content.Seek(0, SeekOrigin.Begin);
                    using (var reader = new StreamReader(requestResult.Content))
                    {
                        content.RawContent = await reader.ReadToEndAsync();
                    }

                    crawlRunner.AddResult(crawlState.Location, content);
                }
            });

            overallCrawlStopwatch.Stop();
            result.ElapsedTime = overallCrawlStopwatch.Elapsed;
            return result;
        }
Exemple #3
0
        /// <summary>
        /// Crawls the site identified by <paramref name="siteUri"/>: loads the robots file for the
        /// site's base address, seeds the crawl with every location listed in the domain's sitemaps,
        /// then handles each response according to its HTTP status code.
        /// </summary>
        /// <remarks>
        /// Per response: redirects are registered via <c>AddRedirect</c>; successful responses are
        /// parsed with <c>settings.ContentProcessor</c> and stored; 5xx responses are queued again;
        /// anything else is stored with <c>null</c> content.
        /// </remarks>
        /// <param name="siteUri">Any URI on the target site; only its scheme/authority and host are used.</param>
        /// <param name="settings">Crawl settings (user agent, request processor options, content processor).</param>
        /// <returns>
        /// A <c>CrawlResult</c> with <c>CrawlStart</c>, <c>CrawledUris</c> and <c>ElapsedTime</c> populated.
        /// </returns>
        public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
        {
            var result = new CrawlResult
            {
                CrawlStart = DateTime.UtcNow
            };
            var overallCrawlStopwatch = Stopwatch.StartNew();

            // Reduce the supplied URI to scheme + authority (e.g. "https://host:port").
            var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
            var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);

            UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

            var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

            //Use any links referred to by the sitemap as a starting point
            var sitemaps = await new SitemapQuery(HttpClient).GetAllSitemapsForDomain(siteUri.Host);

            // De-duplicate across ALL sitemaps, not per sitemap, so a location that appears in
            // several sitemaps is only queued once.
            var urisFromSitemap = sitemaps
                .SelectMany(s => s.Urls.Select(u => u.Location))
                .Distinct();

            foreach (var uri in urisFromSitemap)
            {
                crawlRunner.AddRequest(uri);
            }

            // Status codes whose Location header points at the resource to fetch instead.
            // Built once per crawl rather than once per response.
            var redirectStatusCodes = new[]
            {
                HttpStatusCode.MovedPermanently,  // 301
                HttpStatusCode.Redirect,          // 302
                HttpStatusCode.SeeOther,          // 303
                HttpStatusCode.TemporaryRedirect, // 307
                (HttpStatusCode)308               // 308 Permanent Redirect; cast because the named member is not available on every target framework
            };

            result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
            {
                var response = requestResult.ResponseMessage;

                var crawlRequest = new CrawlRequest
                {
                    RequestStart       = requestResult.RequestStart,
                    ElapsedTime        = requestResult.ElapsedTime,
                    StatusCode         = response.StatusCode,
                    IsSuccessfulStatus = response.IsSuccessStatusCode
                };
                crawlState.Requests.Add(crawlRequest);

                var statusCode = crawlRequest.StatusCode.Value;

                if (redirectStatusCodes.Contains(statusCode))
                {
                    crawlRunner.AddRedirect(crawlState.Location, response.Headers.Location);
                }
                else if (crawlRequest.IsSuccessfulStatus)
                {
                    using (var contentStream = await response.Content.ReadAsStreamAsync())
                    {
                        var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
                        var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);

                        // Parse may have advanced the stream; rewind so the full raw text can be captured too.
                        contentStream.Seek(0, SeekOrigin.Begin);
                        using (var reader = new StreamReader(contentStream))
                        {
                            content.RawContent = await reader.ReadToEndAsync();
                        }

                        crawlRunner.AddResult(crawlState.Location, content);
                    }
                }
                else if ((int)statusCode >= 500 && (int)statusCode <= 599)
                {
                    //On server errors, try to crawl the page again later
                    crawlRunner.AddRequest(crawlState.Location);
                }
                else
                {
                    //On any other error, just save what we have seen and move on
                    //Consider the content of the request irrelevant
                    crawlRunner.AddResult(crawlState.Location, null);
                }
            });

            overallCrawlStopwatch.Stop();
            result.ElapsedTime = overallCrawlStopwatch.Elapsed;
            return result;
        }