static async Task Main(string[] args)
{
    // Default site and depth before grabbing from args.
    var startingUrl = "https://www.crawler-test.com/";
    var maxDepth = 3;

    if (args.Length > 0)
    {
        startingUrl = args[0];
    }

    if (args.Length > 1)
    {
        maxDepth = Convert.ToInt32(args[1]);
    }

    // Set up dependencies for the crawler.
    IDownloader downloader = new Downloader();
    IHtmlParser parser = new HtmlParser();

    // Initialise the crawler and hook into the crawled event.
    var crawler = new WebCrawer(downloader, parser);
    crawler.PageCrawled += (obj, page) => Console.WriteLine(FormatOutput(page));

    Console.WriteLine($"Crawling {startingUrl} to depth {maxDepth}\n");

    // Run the crawler!
    var result = await crawler.RunAsync(startingUrl, maxDepth);

    Console.WriteLine($"Max depth: {result.MaxDepth}");
    Console.WriteLine($"Total links visited: {result.Links.Keys.Count}");

    // Use the Minutes component (not TotalMinutes) so the mm:ss.ff output isn't double-counted.
    Console.WriteLine("Total crawl execution time: {0:00}:{1:00}.{2:00}",
        result.RunTime.Minutes, result.RunTime.Seconds, result.RunTime.Milliseconds / 10);
}
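The `FormatOutput` helper called from the `PageCrawled` handler isn't shown above. A minimal sketch could look like the following; the type name `CrawledPage` and its `Uri` property are assumptions, while `FirstVisitedDepth` and `PageLinks` are the members the tests below exercise.

// Hypothetical sketch only - the real helper lives alongside Main in the console project,
// and the exact shape of the crawled-page type is assumed here.
private static string FormatOutput(CrawledPage page)
{
    var linkCount = page.PageLinks?.Count ?? 0;
    return $"[depth {page.FirstVisitedDepth}] {page.Uri} - {linkCount} link(s) found";
}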
public async Task Test_Crawler_RunAsync()
{
    // Arrange
    var testSite = "https://www.crawler-test.com/";
    var crawler = new WebCrawer(new Downloader(), new HtmlParser());

    // Act
    var result = await crawler.RunAsync(testSite);

    // Assert
    result.MaxDepth.Should().Be(1);
    result.RunTime.Should().BeGreaterThan(new TimeSpan());
    result.Site.Should().BeEquivalentTo(new Uri(testSite));
    result.Links.Count.Should().Be(1); // Hard-coded expected link count - not ideal.
    result.Links.FirstOrDefault().Value.PageLinks.Count.Should().Be(412); // Hard-coded expected link count - not ideal.
}
public async Task Test_Crawler_StartAsync()
{
    // Arrange
    var rootSite = "http://contoso.com";
    var rootPageUri = new Uri(rootSite);
    var page1Uri = new Uri($"{rootSite}/page1");
    var page2Uri = new Uri($"{rootSite}/page2");
    var page3Uri = new Uri($"{rootSite}/page3");

    var fakeHandler = new FakeResponseHandler();
    fakeHandler.AddFakeResponse(rootPageUri, new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent("<a href='/page1'>page1</a><a href='/page2'>page2</a><a href='#'>no link</a>")
    });
    fakeHandler.AddFakeResponse(page1Uri, new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent("<a href='#'></a><a href='https://www.google.com'></a><a href='/page1'>page1</a>")
    });
    fakeHandler.AddFakeResponse(page2Uri, new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent("<a href='#'></a><a href='https://www.facebook.com'></a><a href='/page3'></a>")
    });
    fakeHandler.AddFakeResponse(page3Uri, new HttpResponseMessage(HttpStatusCode.OK)
    {
        Content = new StringContent("no links")
    });

    IDownloader downloader = new Downloader(fakeHandler);
    IHtmlParser parser = new HtmlParser();
    var crawler = new WebCrawer(downloader, parser);

    // Act
    var crawlResult = await crawler.RunAsync(rootSite, 3);
    var rootPage = crawlResult.Links[$"{rootSite}/"];
    var page1 = crawlResult.Links[$"{rootSite}/page1"];
    var page2 = crawlResult.Links[$"{rootSite}/page2"];
    var page3 = crawlResult.Links[$"{rootSite}/page3"];

    // Assert
    crawlResult.Should().NotBeNull();
    crawlResult.Links.Count.Should().Be(4);
    crawlResult.MaxDepth.Should().Be(3);

    rootPage.FirstVisitedDepth.Should().Be(1);
    rootPage.PageLinks.Count.Should().Be(3);
    rootPage.PageLinks.Should().BeEquivalentTo(new List<string> { "http://contoso.com/page1", "http://contoso.com/page2", "#" });

    page1.FirstVisitedDepth.Should().Be(2);
    page1.PageLinks.Count.Should().Be(3);
    page1.PageLinks.Should().BeEquivalentTo(new List<string> { "http://contoso.com/page1", "https://www.google.com", "#" });

    page2.FirstVisitedDepth.Should().Be(2);
    page2.PageLinks.Count.Should().Be(3);

    page3.FirstVisitedDepth.Should().Be(3);
    page3.PageLinks.Should().BeNull();
}
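The test above keeps everything off the network by handing the `Downloader` a `FakeResponseHandler`. Its implementation isn't shown in this section; a minimal sketch along the lines below would satisfy the `AddFakeResponse` calls in the Arrange step, using the standard `DelegatingHandler` pattern for stubbing `HttpClient`. The 404 fallback for unregistered URIs is an assumption.

using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;

// Sketch of a fake HTTP handler: returns a canned response for any registered URI,
// and a 404 for anything else, so tests never touch the real network.
public class FakeResponseHandler : DelegatingHandler
{
    private readonly Dictionary<Uri, HttpResponseMessage> _fakeResponses = new Dictionary<Uri, HttpResponseMessage>();

    public void AddFakeResponse(Uri uri, HttpResponseMessage responseMessage)
    {
        _fakeResponses[uri] = responseMessage;
    }

    protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        if (_fakeResponses.TryGetValue(request.RequestUri, out var response))
        {
            return Task.FromResult(response);
        }

        return Task.FromResult(new HttpResponseMessage(HttpStatusCode.NotFound) { RequestMessage = request });
    }
}

This works because the `Downloader` presumably passes the handler straight into `new HttpClient(handler)`, so every request the crawler makes is resolved from the dictionary instead of the wire.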