public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
{
    var result = new CrawlResult
    {
        CrawlStart = DateTime.UtcNow
    };
    var overallCrawlStopwatch = new Stopwatch();
    overallCrawlStopwatch.Start();

    var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
    var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);
    UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

    var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

    //Use any links referred to by the sitemap as a starting point
    var urisFromSitemap = (await new SitemapQuery(HttpClient)
        .GetAllSitemapsForDomainAsync(siteUri.Host))
        .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());

    foreach (var uri in urisFromSitemap)
    {
        crawlRunner.AddRequest(uri);
    }

    result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
    {
        using (requestResult.Content)
        {
            var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders);
            var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content);
            requestResult.Content.Seek(0, SeekOrigin.Begin);
            content.RawContent = await new StreamReader(requestResult.Content).ReadToEndAsync();
            crawlRunner.AddResult(crawlState.Location, content);
        }
    });

    overallCrawlStopwatch.Stop();
    result.ElapsedTime = overallCrawlStopwatch.Elapsed;
    return result;
}
public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
{
    var result = new CrawlResult
    {
        CrawlStart = DateTime.UtcNow
    };
    var overallCrawlStopwatch = new Stopwatch();
    overallCrawlStopwatch.Start();

    var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
    var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);
    UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

    var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

    //Use any links referred to by the sitemap as a starting point
    var urisFromSitemap = (await new SitemapQuery(HttpClient)
        .GetAllSitemapsForDomain(siteUri.Host))
        .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());

    foreach (var uri in urisFromSitemap)
    {
        crawlRunner.AddRequest(uri);
    }

    result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
    {
        var response = requestResult.ResponseMessage;

        var crawlRequest = new CrawlRequest
        {
            RequestStart = requestResult.RequestStart,
            ElapsedTime = requestResult.ElapsedTime,
            StatusCode = response.StatusCode,
            IsSuccessfulStatus = response.IsSuccessStatusCode
        };
        crawlState.Requests.Add(crawlRequest);

        var redirectStatusCodes = new[]
        {
            HttpStatusCode.MovedPermanently,
            HttpStatusCode.Redirect,
            HttpStatusCode.TemporaryRedirect
        };

        if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
        {
            crawlRunner.AddRedirect(crawlState.Location, response.Headers.Location);
        }
        else if (crawlRequest.IsSuccessfulStatus)
        {
            using (var contentStream = await response.Content.ReadAsStreamAsync())
            {
                var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
                var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);
                contentStream.Seek(0, SeekOrigin.Begin);
                content.RawContent = await new StreamReader(contentStream).ReadToEndAsync();
                crawlRunner.AddResult(crawlState.Location, content);
            }
        }
        else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
        {
            //On server errors, try to crawl the page again later
            crawlRunner.AddRequest(crawlState.Location);
        }
        else
        {
            //On any other error, just save what we have seen and move on
            //Consider the content of the request irrelevant
            crawlRunner.AddResult(crawlState.Location, null);
        }
    });

    overallCrawlStopwatch.Stop();
    result.ElapsedTime = overallCrawlStopwatch.Elapsed;
    return result;
}
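// A minimal usage sketch for the Crawl method shown above, not a confirmed part of the
// library's public surface. Only Crawl(Uri, CrawlSettings), CrawlSettings.UserAgent,
// CrawlResult.CrawledUris and CrawlResult.ElapsedTime appear in the snippets themselves;
// the enclosing class name (Crawler), its parameterless constructor, and the Location
// property on the crawled URI type are assumptions made for illustration.
public static async Task RunCrawlExampleAsync()
{
    var crawler = new Crawler();                      // assumed: the class that exposes Crawl
    var settings = new CrawlSettings
    {
        UserAgent = "ExampleBot/1.0"                  // passed to the robots.txt crawl-delay lookup
    };

    var result = await crawler.Crawl(new Uri("https://example.org/"), settings);

    Console.WriteLine($"Crawl finished in {result.ElapsedTime}");
    foreach (var crawledUri in result.CrawledUris)
    {
        Console.WriteLine(crawledUri.Location);       // assumed property on the crawled URI type
    }
}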