public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
{
    var result = new CrawlResult
    {
        CrawlStart = DateTime.UtcNow
    };
    var overallCrawlStopwatch = new Stopwatch();
    overallCrawlStopwatch.Start();

    var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
    var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);
    UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

    var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

    //Use any links referred to by the sitemap as a starting point
    var urisFromSitemap = (await new SitemapQuery(HttpClient)
        .GetAllSitemapsForDomain(siteUri.Host))
        .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());
    foreach (var uri in urisFromSitemap)
    {
        crawlRunner.AddRequest(uri);
    }

    result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
    {
        var response = requestResult.ResponseMessage;

        var crawlRequest = new CrawlRequest
        {
            RequestStart = requestResult.RequestStart,
            ElapsedTime = requestResult.ElapsedTime,
            StatusCode = response.StatusCode,
            IsSuccessfulStatus = response.IsSuccessStatusCode
        };
        crawlState.Requests.Add(crawlRequest);

        var redirectStatusCodes = new[]
        {
            HttpStatusCode.MovedPermanently,
            HttpStatusCode.Redirect,
            HttpStatusCode.TemporaryRedirect
        };
        if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
        {
            crawlRunner.AddRedirect(crawlState.Location, response.Headers.Location);
        }
        else if (crawlRequest.IsSuccessfulStatus)
        {
            using (var contentStream = await response.Content.ReadAsStreamAsync())
            {
                var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
                var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);
                contentStream.Seek(0, SeekOrigin.Begin);
                content.RawContent = await new StreamReader(contentStream).ReadToEndAsync();
                crawlRunner.AddResult(crawlState.Location, content);
            }
        }
        else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
        {
            //On server errors, try to crawl the page again later
            crawlRunner.AddRequest(crawlState.Location);
        }
        else
        {
            //On any other error, just save what we have seen and move on
            //Consider the content of the request irrelevant
            crawlRunner.AddResult(crawlState.Location, null);
        }
    });

    overallCrawlStopwatch.Stop();
    result.ElapsedTime = overallCrawlStopwatch.Elapsed;
    return result;
}
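For reference, a minimal sketch of what calling this Crawl method might look like is below. The host class name and constructor here are assumptions, and the CrawlSettings members are inferred from how they're used above rather than a definitive API:

//Hypothetical usage sketch - "Crawler" and its constructor are assumed;
//UserAgent, ElapsedTime and CrawledUris come from the code above
var crawler = new Crawler(httpClient, logger);
var settings = new CrawlSettings
{
    UserAgent = "MyCrawler/1.0"
};
var result = await crawler.Crawl(new Uri("https://example.org/"), settings);
//Assumes CrawledUris is enumerable (requires System.Linq for Count())
Console.WriteLine($"Crawled {result.CrawledUris.Count()} URIs in {result.ElapsedTime}");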
private async Task<CrawledUri> PerformRequest(UriCrawlState crawlState, ConcurrentQueue<UriCrawlState> pagesToCrawl, CrawlContext context)
{
    var crawlRequest = new CrawlRequest
    {
        RequestStart = DateTime.UtcNow,
    };
    var stopwatch = new Stopwatch();
    stopwatch.Start();

    using (var response = await HttpClient.GetAsync(crawlState.Location))
    {
        crawlRequest.StatusCode = response.StatusCode;
        crawlRequest.IsSuccessfulStatus = response.IsSuccessStatusCode;

        await response.Content.LoadIntoBufferAsync();

        stopwatch.Stop();
        crawlRequest.ElapsedTime = stopwatch.Elapsed;
        crawlState.Requests.Add(crawlRequest);

        var redirectStatusCodes = new[]
        {
            HttpStatusCode.MovedPermanently,
            HttpStatusCode.Redirect,
            HttpStatusCode.TemporaryRedirect
        };
        if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
        {
            var headerLocation = response.Headers.Location;
            var redirectCrawlState = new UriCrawlState
            {
                //Resolve against the current location as the header value may be relative
                Location = new Uri(crawlState.Location, headerLocation.ToString()),
                Redirects = crawlState.Redirects ?? new List<CrawledUriRedirect>()
            };
            redirectCrawlState.Redirects.Add(new CrawledUriRedirect
            {
                Location = crawlState.Location,
                Requests = crawlState.Requests
            });

            pagesToCrawl.Enqueue(redirectCrawlState);
            //Track the resolved target rather than the raw header value
            //so relative redirects are deduplicated correctly
            context.SeenUris.TryAdd(redirectCrawlState.Location, 0);
            return null;
        }
        else if (crawlRequest.IsSuccessfulStatus)
        {
            return new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.Crawled,
                RedirectChain = crawlState.Redirects,
                Requests = crawlState.Requests,
                Content = await context.Settings.ContentParser.Parse(crawlState.Location, response, context.Settings)
            };
        }
        else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
        {
            //On server errors, try to crawl the page again later
            pagesToCrawl.Enqueue(crawlState);
            return null;
        }
        else
        {
            //On any other error, just save what we have seen and move on
            //Consider the content of the request irrelevant
            return new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.Crawled,
                RedirectChain = crawlState.Redirects,
                Requests = crawlState.Requests
            };
        }
    }
}
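PerformRequest doesn't drive the crawl on its own; it expects a caller that drains the shared queue until no work remains, requeuing redirects and 5xx retries as it goes. A minimal sketch of such a driver is below, assuming a CrawlContext has already been constructed with its Settings and SeenUris populated; the ProcessQueue name and overall shape are illustrative assumptions, not the actual calling code:

//Hypothetical driver loop; only PerformRequest, UriCrawlState, CrawlContext
//and CrawledUri come from the code above - everything else is assumed
private async Task<List<CrawledUri>> ProcessQueue(Uri siteUri, CrawlContext context)
{
    var pagesToCrawl = new ConcurrentQueue<UriCrawlState>();
    pagesToCrawl.Enqueue(new UriCrawlState { Location = siteUri });
    context.SeenUris.TryAdd(siteUri, 0);

    var crawledUris = new List<CrawledUri>();
    while (pagesToCrawl.TryDequeue(out var crawlState))
    {
        //A null result means the URI was requeued (redirect or 5xx retry)
        //rather than completed
        var crawledUri = await PerformRequest(crawlState, pagesToCrawl, context);
        if (crawledUri != null)
        {
            crawledUris.Add(crawledUri);
        }
    }
    return crawledUris;
}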