// Verifies a full crawl of the fake site builds the expected page graph for the root page.
// Fixed: was `async void` — exceptions were unobservable and the test runner could not
// await completion; async xUnit tests must return Task.
public async Task Crawl()
{
    var runner = new CrawlRunner(_baseUrl, new HttpClient(new FakeHandler()), _parser, _sanitiser);

    var results = await runner.Crawl();

    var testPage = new Uri("https://testing.com/");
    var expectedPage = new Page
    {
        InLinks = new List<Uri> { new Uri("https://testing.com/second.html") },
        OutLinks = new List<Uri>
        {
            new Uri("https://testing.com/first.html"),
            new Uri("https://testing.com/second.html"),
            new Uri("https://testing.com/error.html")
        },
        Status = CrawlStatus.Success,
        Title = "Index page",
        Url = testPage
    };

    // Compare property-by-property for clearer failure messages than whole-object equality.
    Assert.Equal(expectedPage.Title, results[testPage].Title);
    Assert.Equal(expectedPage.Status, results[testPage].Status);
    Assert.Equal(expectedPage.Url, results[testPage].Url);
    Assert.Equal(expectedPage.OutLinks, results[testPage].OutLinks);
    Assert.Equal(expectedPage.InLinks, results[testPage].InLinks);
}
// Verifies the crawl pipeline passes discovered links through the URL sanitiser.
// Fixed: was `async void` (unawaitable, exceptions unobservable) — now returns Task;
// also removed the unused `results` local.
public async Task Parse_Links_Sanitises()
{
    // The fake wraps the real sanitiser (CallsBaseMethods) so behaviour is unchanged
    // while every call is recorded for verification.
    var fakeSanitiser = A.Fake<UrlSanitiser>(x => x
        .CallsBaseMethods()
        .WithArgumentsForConstructor(() => new UrlSanitiser(_baseUrl)));
    var runner = new CrawlRunner(_baseUrl, new HttpClient(new FakeHandler()), _parser, fakeSanitiser);

    await runner.Crawl();

    A.CallTo(() => fakeSanitiser.SanitiseLocal(A<string>.Ignored)).MustHaveHappened();
}
// Verifies the crawler extracts the <title> of each crawled page.
// Fixed: was `async void` — exceptions were unobservable and the test runner could not
// await completion; async xUnit tests must return Task.
public async Task Parse_Title()
{
    var runner = new CrawlRunner(_baseUrl, new HttpClient(new FakeHandler()), _parser, _sanitiser);

    var results = await runner.Crawl();

    Assert.Equal("Index page", results[new Uri("https://testing.com")].Title);
    Assert.Equal("First page", results[new Uri("https://testing.com/first.html")].Title);
    Assert.Equal("Second page", results[new Uri("https://testing.com/second.html")].Title);
}
// Verifies duplicate out-links on a page are collapsed to a single entry.
// Fixed: was `async void` — exceptions were unobservable and the test runner could not
// await completion; async xUnit tests must return Task.
public async Task Parse_Links_Duplicate()
{
    var runner = new CrawlRunner(_baseUrl, new HttpClient(new FakeHandler()), _parser, _sanitiser);

    var results = await runner.Crawl();

    var expectedLinks = new List<string>
    {
        "https://testing.com/",
        "https://testing.com/first.html"
    };
    var actualLinks = results[new Uri("https://testing.com/second.html")]
        .OutLinks
        .Select(x => x.ToString())
        .ToList();
    Assert.Equal(expectedLinks, actualLinks);
}
/// <summary>
/// Crawls the site rooted at <paramref name="siteUri"/>: fetches and applies robots.txt,
/// seeds the crawl queue from the domain's sitemaps, then processes every discovered URI.
/// </summary>
/// <param name="siteUri">Any URI on the target site; only its authority is used as the crawl root.</param>
/// <param name="settings">Crawl configuration (user agent, content processor, request options).</param>
/// <returns>The crawl result with all crawled URIs and the total elapsed time.</returns>
public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
{
    var crawlResult = new CrawlResult { CrawlStart = DateTime.UtcNow };
    var stopwatch = Stopwatch.StartNew();

    var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
    var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);
    UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

    var runner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

    // Seed the crawl with every distinct URL listed in the domain's sitemaps.
    var sitemaps = await new SitemapQuery(HttpClient).GetAllSitemapsForDomainAsync(siteUri.Host);
    foreach (var sitemapUri in sitemaps.SelectMany(s => s.Urls.Select(u => u.Location).Distinct()))
    {
        runner.AddRequest(sitemapUri);
    }

    crawlResult.CrawledUris = await runner.ProcessAsync(async (requestResult, crawlState) =>
    {
        using (requestResult.Content)
        {
            var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders);
            var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content);
            // Parse consumed the stream; rewind before capturing the raw body.
            requestResult.Content.Seek(0, SeekOrigin.Begin);
            content.RawContent = await new StreamReader(requestResult.Content).ReadToEndAsync();
            runner.AddResult(crawlState.Location, content);
        }
    });

    stopwatch.Stop();
    crawlResult.ElapsedTime = stopwatch.Elapsed;
    return crawlResult;
}
/// <summary>
/// Crawls the site rooted at <paramref name="siteUri"/>: fetches and applies robots.txt,
/// seeds the crawl queue from the domain's sitemaps, then processes every discovered URI,
/// recording a <c>CrawlRequest</c> per response and branching on its status
/// (redirect / success / server error retry / other error).
/// </summary>
/// <param name="siteUri">Any URI on the target site; only its authority is used as the crawl root.</param>
/// <param name="settings">Crawl configuration (user agent, content processor, request options).</param>
/// <returns>The crawl result with all crawled URIs and the total elapsed time.</returns>
public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
{
    var result = new CrawlResult { CrawlStart = DateTime.UtcNow };
    var overallCrawlStopwatch = new Stopwatch();
    overallCrawlStopwatch.Start();

    var baseUri = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
    var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);
    UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);
    var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

    //Use any links referred to by the sitemap as a starting point
    var urisFromSitemap = (await new SitemapQuery(HttpClient)
        .GetAllSitemapsForDomain(siteUri.Host))
        .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());
    foreach (var uri in urisFromSitemap)
    {
        crawlRunner.AddRequest(uri);
    }

    // Fixed: hoisted out of the per-response callback so this array is allocated once
    // per crawl instead of once per processed URI.
    // NOTE(review): 308 PermanentRedirect and 303 SeeOther are not treated as redirects
    // here — confirm whether that is intentional before adding them.
    var redirectStatusCodes = new[]
    {
        HttpStatusCode.MovedPermanently,
        HttpStatusCode.Redirect,
        HttpStatusCode.TemporaryRedirect
    };

    result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
    {
        var response = requestResult.ResponseMessage;

        // Record timing and status for this request against the crawl state.
        var crawlRequest = new CrawlRequest
        {
            RequestStart = requestResult.RequestStart,
            ElapsedTime = requestResult.ElapsedTime,
            StatusCode = response.StatusCode,
            IsSuccessfulStatus = response.IsSuccessStatusCode
        };
        crawlState.Requests.Add(crawlRequest);

        if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
        {
            // NOTE(review): response.Headers.Location may be relative — assumed
            // AddRedirect resolves it against the current location; confirm.
            crawlRunner.AddRedirect(crawlState.Location, response.Headers.Location);
        }
        else if (crawlRequest.IsSuccessfulStatus)
        {
            using (var contentStream = await response.Content.ReadAsStreamAsync())
            {
                var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
                var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);
                // Parse consumed the stream; rewind before capturing the raw body.
                contentStream.Seek(0, SeekOrigin.Begin);
                content.RawContent = await new StreamReader(contentStream).ReadToEndAsync();
                crawlRunner.AddResult(crawlState.Location, content);
            }
        }
        else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
        {
            //On server errors, try to crawl the page again later
            crawlRunner.AddRequest(crawlState.Location);
        }
        else
        {
            //On any other error, just save what we have seen and move on
            //Consider the content of the request irrelevant
            crawlRunner.AddResult(crawlState.Location, null);
        }
    });

    overallCrawlStopwatch.Stop();
    result.ElapsedTime = overallCrawlStopwatch.Elapsed;
    return result;
}