Example #1
        [Fact]
        public async Task Crawl()
        {
            var runner  = new CrawlRunner(_baseUrl, new HttpClient(new FakeHandler()), _parser, _sanitiser);
            var results = await runner.Crawl();

            var testPage     = new Uri("https://testing.com/");
            var expectedPage = new Page
            {
                InLinks = new List<Uri> {
                    new Uri("https://testing.com/second.html")
                },
                OutLinks = new List<Uri> {
                    new Uri("https://testing.com/first.html"),
                    new Uri("https://testing.com/second.html"),
                    new Uri("https://testing.com/error.html")
                },
                Status = CrawlStatus.Success,
                Title  = "Index page",
                Url    = testPage
            };

            Assert.Equal(expectedPage.Title, results[testPage].Title);
            Assert.Equal(expectedPage.Status, results[testPage].Status);
            Assert.Equal(expectedPage.Url, results[testPage].Url);
            Assert.Equal(expectedPage.OutLinks, results[testPage].OutLinks);
            Assert.Equal(expectedPage.InLinks, results[testPage].InLinks);
        }
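
All four tests above build their HttpClient around a FakeHandler that is not shown in these listings. A minimal sketch of such a test double follows: an HttpMessageHandler that serves canned HTML instead of hitting the network. The markup here is an assumption that only mirrors the link graph the assertions expect (index -> first/second/error, second -> index/first), not the fixture's actual content.

using System.Collections.Generic;
using System.Net;
using System.Net.Http;
using System.Text;
using System.Threading;
using System.Threading.Tasks;

    public class FakeHandler : HttpMessageHandler
    {
        // Canned pages keyed by absolute URI; illustrative markup only.
        private static readonly Dictionary<string, string> Pages = new Dictionary<string, string>
        {
            ["https://testing.com/"] =
                "<html><head><title>Index page</title></head><body>" +
                "<a href=\"first.html\"></a><a href=\"second.html\"></a><a href=\"error.html\"></a>" +
                "</body></html>",
            ["https://testing.com/first.html"] =
                "<html><head><title>First page</title></head><body></body></html>",
            ["https://testing.com/second.html"] =
                "<html><head><title>Second page</title></head><body>" +
                "<a href=\"/\"></a><a href=\"first.html\"></a>" +
                "</body></html>"
        };

        protected override Task<HttpResponseMessage> SendAsync(HttpRequestMessage request, CancellationToken cancellationToken)
        {
            if (Pages.TryGetValue(request.RequestUri.ToString(), out var html))
            {
                return Task.FromResult(new HttpResponseMessage(HttpStatusCode.OK)
                {
                    Content = new StringContent(html, Encoding.UTF8, "text/html")
                });
            }

            // Unknown URIs (e.g. error.html) come back as 404s.
            return Task.FromResult(new HttpResponseMessage(HttpStatusCode.NotFound));
        }
    }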
Example #2
        [Fact]
        public async Task Parse_Links_Sanitises()
        {
            // The fake sanitiser delegates to the real UrlSanitiser but records every call.
            var fakesanitiser = A.Fake<UrlSanitiser>(x => x
                .CallsBaseMethods()
                .WithArgumentsForConstructor(() => new UrlSanitiser(_baseUrl)));
            var runner = new CrawlRunner(_baseUrl, new HttpClient(new FakeHandler()), _parser, fakesanitiser);

            await runner.Crawl();

            A.CallTo(() => fakesanitiser.SanitiseLocal(A<string>.Ignored)).MustHaveHappened();
        }
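
Because CallsBaseMethods() lets the fake fall through to the real UrlSanitiser implementation, the crawl still runs against properly sanitised URLs while FakeItEasy records each call for verification. If the number of links in the fixture is known, the assertion could be tightened; the lower bound below is hypothetical:

        // Hypothetical tighter check; the expected count depends on the fixture's pages.
        A.CallTo(() => fakesanitiser.SanitiseLocal(A<string>.Ignored))
            .MustHaveHappenedANumberOfTimesMatching(n => n >= 3);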
Example #3
        [Fact]
        public async Task Parse_Title()
        {
            var runner  = new CrawlRunner(_baseUrl, new HttpClient(new FakeHandler()), _parser, _sanitiser);
            var results = await runner.Crawl();

            Assert.Equal("Index page", results[new Uri("https://testing.com")].Title);
            Assert.Equal("First page", results[new Uri("https://testing.com/first.html")].Title);
            Assert.Equal("Second page", results[new Uri("https://testing.com/second.html")].Title);
        }
Example #4
        [Fact]
        public async Task Parse_Links_Duplicate()
        {
            var runner  = new CrawlRunner(_baseUrl, new HttpClient(new FakeHandler()), _parser, _sanitiser);
            var results = await runner.Crawl();

            var expectedLinks = new List<string> {
                "https://testing.com/", "https://testing.com/first.html"
            };
            var actualLinks = results[new Uri("https://testing.com/second.html")].OutLinks.Select(x => x.ToString()).ToList();

            Assert.Equal(expectedLinks, actualLinks);
        }
Example #5
        public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
        {
            var result = new CrawlResult
            {
                CrawlStart = DateTime.UtcNow
            };
            var overallCrawlStopwatch = new Stopwatch();

            overallCrawlStopwatch.Start();

            var baseUri    = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
            var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);

            UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

            var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

            //Use any links referred to by the sitemap as a starting point
            var urisFromSitemap = (await new SitemapQuery(HttpClient)
                                   .GetAllSitemapsForDomainAsync(siteUri.Host))
                                  .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());

            foreach (var uri in urisFromSitemap)
            {
                crawlRunner.AddRequest(uri);
            }

            result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
            {
                using (requestResult.Content)
                {
                    var headers = new CrawlHeaders(requestResult.ResponseHeaders, requestResult.ContentHeaders);
                    var content = settings.ContentProcessor.Parse(crawlState.Location, headers, requestResult.Content);
                    requestResult.Content.Seek(0, SeekOrigin.Begin);
                    content.RawContent = await new StreamReader(requestResult.Content).ReadToEndAsync();
                    crawlRunner.AddResult(crawlState.Location, content);
                }
            });

            overallCrawlStopwatch.Stop();
            result.ElapsedTime = overallCrawlStopwatch.Elapsed;
            return result;
        }
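
For orientation, here is how the method above might be driven. The parameterless constructor and usable CrawlSettings defaults are assumptions not shown in the snippet; only the Crawl entry point itself comes from the listing:

        // Hypothetical caller; construction details of the enclosing type are assumed.
        var crawler = new Crawler();
        var result  = await crawler.Crawl(new Uri("https://example.org/"), new CrawlSettings());

        Console.WriteLine($"Crawl started {result.CrawlStart:u} and took {result.ElapsedTime}.");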
Example #6
        public async Task<CrawlResult> Crawl(Uri siteUri, CrawlSettings settings)
        {
            var result = new CrawlResult
            {
                CrawlStart = DateTime.UtcNow
            };
            var overallCrawlStopwatch = new Stopwatch();

            overallCrawlStopwatch.Start();

            var baseUri    = new Uri(siteUri.GetLeftPart(UriPartial.Authority));
            var robotsFile = await new RobotsFileParser(HttpClient).FromUriAsync(baseUri);

            UpdateCrawlDelay(robotsFile, settings.UserAgent, settings.RequestProcessorOptions);

            var crawlRunner = new CrawlRunner(baseUri, robotsFile, HttpClient, settings, Logger);

            //Use any links referred to by the sitemap as a starting point
            var urisFromSitemap = (await new SitemapQuery(HttpClient)
                                   .GetAllSitemapsForDomain(siteUri.Host))
                                  .SelectMany(s => s.Urls.Select(u => u.Location).Distinct());

            foreach (var uri in urisFromSitemap)
            {
                crawlRunner.AddRequest(uri);
            }

            result.CrawledUris = await crawlRunner.ProcessAsync(async (requestResult, crawlState) =>
            {
                var response = requestResult.ResponseMessage;

                var crawlRequest = new CrawlRequest
                {
                    RequestStart       = requestResult.RequestStart,
                    ElapsedTime        = requestResult.ElapsedTime,
                    StatusCode         = response.StatusCode,
                    IsSuccessfulStatus = response.IsSuccessStatusCode
                };
                crawlState.Requests.Add(crawlRequest);

                var redirectStatusCodes = new[]
                {
                    HttpStatusCode.MovedPermanently,
                    HttpStatusCode.Redirect,
                    HttpStatusCode.TemporaryRedirect
                };
                if (redirectStatusCodes.Contains(crawlRequest.StatusCode.Value))
                {
                    crawlRunner.AddRedirect(crawlState.Location, response.Headers.Location);
                }
                else if (crawlRequest.IsSuccessfulStatus)
                {
                    using (var contentStream = await response.Content.ReadAsStreamAsync())
                    {
                        var headers = new CrawlHeaders(response.Headers, response.Content.Headers);
                        var content = settings.ContentProcessor.Parse(crawlState.Location, headers, contentStream);
                        contentStream.Seek(0, SeekOrigin.Begin);
                        content.RawContent = await new StreamReader(contentStream).ReadToEndAsync();
                        crawlRunner.AddResult(crawlState.Location, content);
                    }
                }
                else if ((int)crawlRequest.StatusCode >= 500 && (int)crawlRequest.StatusCode <= 599)
                {
                    //On server errors, try to crawl the page again later
                    crawlRunner.AddRequest(crawlState.Location);
                }
                else
                {
                    //On any other error, just save what we have seen and move on
                    //Consider the content of the request irrelevant
                    crawlRunner.AddResult(crawlState.Location, null);
                }
            });

            overallCrawlStopwatch.Stop();
            result.ElapsedTime = overallCrawlStopwatch.Elapsed;
            return result;
        }
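
One small design note on the callback above: the redirectStatusCodes array is reallocated for every response. A minimal refactor, assuming the enclosing class permits a static field, hoists it out so the status-code triage stays allocation-free:

        // Hypothetical hoisted field; behaviour is identical to the inline array above.
        private static readonly HttpStatusCode[] RedirectStatusCodes =
        {
            HttpStatusCode.MovedPermanently,
            HttpStatusCode.Redirect,
            HttpStatusCode.TemporaryRedirect
        };

The callback would then test RedirectStatusCodes.Contains(crawlRequest.StatusCode.Value) exactly as before.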