Example #1
        static async Task Main(string[] args)
        {
            // Default site and crawl depth, overridden by command-line args when provided.
            var startingUrl = "https://www.crawler-test.com/";
            var maxDepth    = 3;

            if (args.Length > 0)
            {
                startingUrl = args[0];
            }
            if (args.Length > 1)
            {
                maxDepth = Convert.ToInt32(args[1]);
            }

            // Setup dependencies for the crawler.
            IDownloader downloader = new Downloader();
            IHtmlParser parser     = new HtmlParser();

            // Initialise the crawler and hook into the crawled event.
            var crawler = new WebCrawer(downloader, parser);

            crawler.PageCrawled += (obj, page) => Console.WriteLine(FormatOutput(page));

            Console.WriteLine($"Crawling {startingUrl} to depth {maxDepth}\n");

            // Run the crawler!
            var result = await crawler.RunAsync(startingUrl, maxDepth);

            Console.WriteLine($"Max depth: {result.MaxDepth}");
            Console.WriteLine($"Total links visited: {result.Links.Keys.Count}");
            Console.WriteLine("Total crawl execution time: {0:00}:{1:00}.{2:00}", result.RunTime.TotalMinutes, result.RunTime.Seconds, result.RunTime.Milliseconds / 10);
        }
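Example #1 calls a FormatOutput helper that is not shown in the snippet. A minimal sketch of what such a helper could look like follows; the CrawledPage type name and the Address property are assumptions made here for illustration, while FirstVisitedDepth and PageLinks are taken from the assertions in Example #2.

        // Hypothetical helper - not part of the original example.
        // CrawledPage and Address are assumed names; FirstVisitedDepth and PageLinks appear in Example #2.
        private static string FormatOutput(CrawledPage page)
        {
            var linkCount = page.PageLinks?.Count ?? 0;
            return $"[depth {page.FirstVisitedDepth}] {page.Address} -> {linkCount} link(s)";
        }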
Example #2
        public async Task Test_Crawler_RunAsync()
        {
            // Arrange
            var testSite = "https://www.crawler-test.com/";
            var crawler  = new WebCrawer(new Downloader(), new HtmlParser());

            // Act
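            // RunAsync is called without a depth argument; the assertions below expect a default crawl depth of 1.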
            var result = await crawler.RunAsync(testSite);

            // Assert
            result.MaxDepth.Should().Be(1);
            result.RunTime.Should().BeGreaterThan(TimeSpan.Zero);
            result.Site.Should().BeEquivalentTo(new Uri(testSite));
            result.Links.Count.Should().Be(1);                                    // hard coded expected links - not ideal.
            result.Links.FirstOrDefault().Value.PageLinks.Count.Should().Be(412); // hard coded expected links - not ideal.
        }

        public async Task Test_Crawler_StartAsync()
        {
            // Arrange
            var rootSite    = "http://contoso.com";
            var rootPageUri = new Uri(rootSite);
            var page1Uri    = new Uri($"{rootSite}/page1");
            var page2Uri    = new Uri($"{rootSite}/page2");
            var page3Uri    = new Uri($"{rootSite}/page3");
            var fakeHandler = new FakeResponseHandler();

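            // Stub HTTP responses so the crawl never touches the network:
            // the root page links to page1 and page2, and page2 links on to page3.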
            fakeHandler.AddFakeResponse(rootPageUri, new HttpResponseMessage(HttpStatusCode.OK)
            {
                Content = new StringContent("<a href='/page1'>page1</a><a href='/page2'>page2</a><a href='#'>no link</a>")
            });
            fakeHandler.AddFakeResponse(page1Uri, new HttpResponseMessage(HttpStatusCode.OK)
            {
                Content = new StringContent("<a href='#'></a><a href='https://www.google.com'></a><a href='/page1'>page1</a>")
            });
            fakeHandler.AddFakeResponse(page2Uri, new HttpResponseMessage(HttpStatusCode.OK)
            {
                Content = new StringContent("<a href='#'></a><a href='https://www.facebook.com'></a><a href='/page3'></a>")
            });
            fakeHandler.AddFakeResponse(page3Uri, new HttpResponseMessage(HttpStatusCode.OK)
            {
                Content = new StringContent("no links")
            });

            IDownloader downloader = new Downloader(fakeHandler);
            IHtmlParser parser     = new HtmlParser();
            var         crawler    = new WebCrawer(downloader, parser);

            // Act
            var crawlResult = await crawler.RunAsync(rootSite, 3);

            var rootPage = crawlResult.Links[$"{rootSite}/"];
            var page1    = crawlResult.Links[$"{rootSite}/page1"];
            var page2    = crawlResult.Links[$"{rootSite}/page2"];
            var page3    = crawlResult.Links[$"{rootSite}/page3"];

            // Assert
            crawlResult.Should().NotBeNull();
            crawlResult.Links.Count.Should().Be(4);
            crawlResult.MaxDepth.Should().Be(3);

            rootPage.FirstVisitedDepth.Should().Be(1);
            rootPage.PageLinks.Count.Should().Be(3);
            rootPage.PageLinks.Should().BeEquivalentTo(new List<string> {
                "http://contoso.com/page1", "http://contoso.com/page2", "#"
            });

            page1.FirstVisitedDepth.Should().Be(2);
            page1.PageLinks.Count.Should().Be(3);
            page1.PageLinks.Should().BeEquivalentTo(new List<string> {
                "http://contoso.com/page1", "https://www.google.com", "#"
            });

            page2.FirstVisitedDepth.Should().Be(2);
            page2.PageLinks.Count.Should().Be(3);

            page3.FirstVisitedDepth.Should().Be(3);
            page3.PageLinks.Should().BeNull();
        }
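The FakeResponseHandler used in the second test is not shown. A minimal sketch of one possible shape, assuming it is an HttpMessageHandler that returns canned responses keyed by request URI; this is an illustration only, not necessarily the project's actual implementation.

using System;
using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Threading;
using System.Threading.Tasks;

// Illustrative sketch only - the project's real FakeResponseHandler may differ.
public class FakeResponseHandler : HttpMessageHandler
{
    private readonly Dictionary<Uri, HttpResponseMessage> _responses = new Dictionary<Uri, HttpResponseMessage>();

    // Register a canned response for a specific URI.
    public void AddFakeResponse(Uri uri, HttpResponseMessage response)
    {
        _responses[uri] = response;
    }

    protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
    {
        // Serve the canned response if one is registered; otherwise return 404 so a missing stub fails loudly.
        return _responses.TryGetValue(request.RequestUri, out var response)
            ? Task.FromResult(response)
            : Task.FromResult(new HttpResponseMessage(HttpStatusCode.NotFound) { RequestMessage = request });
    }
}

With this shape, the AddFakeResponse calls in the test compile as written; the Downloader presumably passes the handler into its internal HttpClient so that every request is answered from the dictionary.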