public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
{
    var result = new CrawlResult
    {
        CrawlStart = DateTime.UtcNow
    };
    var overallCrawlStopwatch = new Stopwatch();
    overallCrawlStopwatch.Start();

    var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
    var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);
    UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

    var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

    //Use any links referred to by the sitemap as a starting point
    var urisFromSitemap = (await new SitemapQuery(HttpClient)
        .GetAllSitemapsForDomainAsync(siteUri.Host))
        .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());
    foreach (var uri in urisFromSitemap)
    {
        crawlRunner.AddRequest(uri);
    }

    result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
    {
        using (requestResult.Content)
        {
            var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders);
            var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content);
            requestResult.Content.Seek(0, SeekOrigin.Begin);
            content.RawContent = await new StreamReader(requestResult.Content).ReadToEndAsync();
            crawlRunner.AddResult(crawlState.Location, content);
        }
    });

    overallCrawlStopwatch.Stop();
    result.ElapsedTime = overallCrawlStopwatch.Elapsed;
    return result;
}
public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
{
    var result = new CrawlResult
    {
        CrawlStart = DateTime.UtcNow
    };
    var overallCrawlStopwatch = new Stopwatch();
    overallCrawlStopwatch.Start();

    var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
    var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);
    UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

    var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

    //Use any links referred to by the sitemap as a starting point
    var urisFromSitemap = (await new SitemapQuery(HttpClient)
        .GetAllSitemapsForDomain(siteUri.Host))
        .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());
    foreach (var uri in urisFromSitemap)
    {
        crawlRunner.AddRequest(uri);
    }

    result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
    {
        var response = requestResult.ResponseMessage;

        var crawlRequest = new CrawlRequest
        {
            RequestStart = requestResult.RequestStart,
            ElapsedTime = requestResult.ElapsedTime,
            StatusCode = response.StatusCode,
            IsSuccessfulStatus = response.IsSuccessStatusCode
        };
        crawlState.Requests.Add(crawlRequest);

        var redirectStatusCodes = new[]
        {
            HttpStatusCode.MovedPermanently,
            HttpStatusCode.Redirect,
            HttpStatusCode.TemporaryRedirect
        };

        if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
        {
            crawlRunner.AddRedirect(crawlState.Location, response.Headers.Location);
        }
        else if (crawlRequest.IsSuccessfulStatus)
        {
            using (var contentStream = await response.Content.ReadAsStreamAsync())
            {
                var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
                var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);
                contentStream.Seek(0, SeekOrigin.Begin);
                content.RawContent = await new StreamReader(contentStream).ReadToEndAsync();
                crawlRunner.AddResult(crawlState.Location, content);
            }
        }
        else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
        {
            //On server errors, try to crawl the page again later
            crawlRunner.AddRequest(crawlState.Location);
        }
        else
        {
            //On any other error, just save what we have seen and move on
            //Consider the content of the request irrelevant
            crawlRunner.AddResult(crawlState.Location, null);
        }
    });

    overallCrawlStopwatch.Stop();
    result.ElapsedTime = overallCrawlStopwatch.Elapsed;
    return result;
}
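The two variants above delegate the robots.txt crawl-delay handling to an UpdateCrawlDelay helper whose body is not shown. Below is a minimal sketch of what such a helper might look like, mirroring the inline crawl-delay logic of the next variant; it assumes RequestProcessorOptions exposes a TimeSpan property named DelayBetweenRequestStart (that property name is an assumption, not confirmed by the snippets).

//Sketch only: not the project's actual implementation of UpdateCrawlDelay.
//Mirrors the inline crawl-delay logic from the variant further below - take the larger
//of the robots.txt crawl-delay and the configured delay between request starts.
//The DelayBetweenRequestStart property name is an assumption for illustration.
private void UpdateCrawlDelay(RobotsFile robotsFile, string userAgent, RequestProcessorOptions options)
{
    var userAgentEntry = robotsFile.GetEntryForUserAgent(userAgent);
    var minimumCrawlDelay = TimeSpan.FromSeconds(userAgentEntry?.CrawlDelay ?? 0);
    if (minimumCrawlDelay > options.DelayBetweenRequestStart)
    {
        options.DelayBetweenRequestStart = minimumCrawlDelay;
    }
}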
public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
{
    var result = new CrawlResult
    {
        CrawlStart = DateTime.UtcNow
    };
    var stopwatch = new Stopwatch();
    stopwatch.Start();

    var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
    var robotsFile = await new RobotsParser(HttpClient).FromUriAsync(baseUri);

    //Apply Robots.txt crawl-delay (if defined)
    var userAgentEntry = robotsFile.GetEntryForUserAgent(settings.UserAgent);
    var minimumCrawlDelay = userAgentEntry?.CrawlDelay ?? 0;
    var taskDelay = Math.Max(minimumCrawlDelay * 1000, settings.TaskHandlerOptions.DelayBetweenTaskStart.TotalMilliseconds);
    settings.TaskHandlerOptions.DelayBetweenTaskStart = new TimeSpan(0, 0, 0, 0, (int)taskDelay);

    var seedUris = new List<UriCrawlState>
    {
        new UriCrawlState { Location = baseUri }
    };

    //Use any links referred to by the sitemap as a starting point
    seedUris.AddRange(
        (await new SitemapQuery(HttpClient)
            .GetAllSitemapsForDomain(siteUri.Host))
            .SelectMany(s => s.Urls.Select(u => new UriCrawlState { Location = u.Location }))
            .Distinct()
    );

    var crawlContext = new CrawlContext
    {
        Settings = settings
    };

    await TaskHandler.For(seedUris.Distinct().ToArray(), async (crawlState, pagesToCrawl) =>
    {
        //Skip URIs that are outside the crawl scope or have already been crawled
        if (!CheckUriValidity(crawlState.Location, baseUri, crawlContext))
        {
            return;
        }

        if (crawlContext.CrawledUris.ContainsKey(crawlState.Location))
        {
            return;
        }

        crawlContext.SeenUris.TryAdd(crawlState.Location, 0);

        var lastRequest = crawlState.Requests.LastOrDefault();
        if (lastRequest != null && lastRequest.IsSuccessfulStatus)
        {
            return;
        }
        else if (crawlState.Requests.Count() == settings.NumberOfRetries)
        {
            //Give up on the URI once the retry limit has been reached
            crawlContext.CrawledUris.TryAdd(crawlState.Location, new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.MaxRetries,
                Requests = crawlState.Requests,
                RedirectChain = crawlState.Redirects
            });
        }
        else if (robotsFile.IsAllowedAccess(crawlState.Location, settings.UserAgent))
        {
            var crawledUri = await PerformRequest(crawlState, pagesToCrawl, crawlContext);
            if (crawledUri != null)
            {
                crawlContext.CrawledUris.TryAdd(crawlState.Location, crawledUri);

                //Queue up any new, valid links found on the page
                if (crawledUri.Content?.Links?.Any() == true)
                {
                    foreach (var crawlLink in crawledUri.Content.Links)
                    {
                        if (CheckUriValidity(crawlLink.Location, baseUri, crawlContext))
                        {
                            if (crawlContext.SeenUris.ContainsKey(crawlLink.Location))
                            {
                                continue;
                            }

                            crawlContext.SeenUris.TryAdd(crawlLink.Location, 0);
                            pagesToCrawl.Enqueue(new UriCrawlState
                            {
                                Location = crawlLink.Location
                            });
                        }
                    }
                }
            }
        }
        else
        {
            //Record URIs that Robots.txt rules blocked us from requesting
            crawlContext.CrawledUris.TryAdd(crawlState.Location, new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.RobotsBlocked
            });
        }
    }, settings.TaskHandlerOptions);

    stopwatch.Stop();
    result.ElapsedTime = stopwatch.Elapsed;
    result.CrawledUris = crawlContext.CrawledUris.Values;
    return result;
}
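A rough usage sketch for any of the Crawl variants above, assuming they live on a crawler class (called Crawler here for illustration) that supplies the HttpClient and Logger members the methods reference; the class name and its parameterless constructor are assumptions, while CrawlSettings.UserAgent and the CrawlResult members come from the snippets themselves.

//Usage sketch only; the Crawler class name and parameterless constructor are assumptions.
var crawler = new Crawler();
var settings = new CrawlSettings
{
    UserAgent = "ExampleBot/1.0"
};

var result = await crawler.Crawl(new Uri("https://example.org/"), settings);

Console.WriteLine($"Crawl started {result.CrawlStart:u} and took {result.ElapsedTime}");
foreach (var crawledUri in result.CrawledUris)
{
    Console.WriteLine(crawledUri.Location);
}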