Example #1
        // Setting up bot config
        public void setup_abot()
        {
            CrawlConfiguration crawlConfig = new CrawlConfiguration();

            crawlConfig.CrawlTimeoutSeconds = 150;
            crawlConfig.MaxConcurrentThreads = 25;
            crawlConfig.IsExternalPageCrawlingEnabled = false;
            crawlConfig.MaxCrawlDepth = 1;
            crawlConfig.MaxPagesToCrawl = 1000;
            crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";

            crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                Regex rx = new Regex(@"\d{5}");

                if (!rx.IsMatch(pageToCrawl.Uri.ToString()) && !pageToCrawl.Uri.ToString().Contains("text="))
                    return new CrawlDecision { Allow = false, Reason = "Want only comlinks" };

                return new CrawlDecision { Allow = true, Reason = "OK Link" };
            });
        }
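The crawler_ProcessPageCrawlStarting and crawler_ProcessPageCrawlCompleted handlers wired up above are not shown in this example. A minimal sketch of what they might look like, assuming the standard Abot event argument types (PageCrawlStartingArgs exposing PageToCrawl, PageCrawlCompletedArgs exposing CrawledPage) and System.Net for HttpStatusCode:

        // Hedged sketch of the handlers referenced above; not part of the original snippet.
        void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
        {
            Console.WriteLine("About to crawl {0}", e.PageToCrawl.Uri.AbsoluteUri);
        }

        void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
        {
            CrawledPage crawledPage = e.CrawledPage;

            if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
                Console.WriteLine("Crawl of page failed: {0}", crawledPage.Uri.AbsoluteUri);
            else
                Console.WriteLine("Crawl of page succeeded: {0}", crawledPage.Uri.AbsoluteUri);
        }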
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri> { uri1, uri2 };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3);//This is more than the max configured crawl delay, so it should be ignored in favor of the max
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;//By setting this to true we expect the IDomainRateLimiter to be called
            _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2; //This is less than the robots.txt crawl delay, so this max should be used
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeHttpRequester.VerifyAll();
            _fakeHyperLinkParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));//With a crawl delay above zero we expect the IDomainRateLimiter to be called
        }
Example #3
        static void Main(string[] args)
        {
            CrawlConfiguration config = new CrawlConfiguration();
            config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

            // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
            // Make sure to dispose this instance or you will have a zombie process!
            IWebDriver driver = CreatePhantomJsDriver(config);

            // Create the content extractor that uses PhantomJS.
            IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

            // Create a PageRequester that will use the extractor.
            IPageRequester requester = new PageRequester(config, extractor);

            using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null)) {
                crawler.PageCrawlCompleted += OnPageCrawlCompleted;

                CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));
                if (result.ErrorOccurred)
                    Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
                else
                    Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
            }

            Console.Read();
        }
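CreatePhantomJsDriver is referenced above but not defined in the snippet. A rough sketch of how it could be implemented with the Selenium PhantomJS bindings (OpenQA.Selenium and OpenQA.Selenium.PhantomJS), forwarding the crawl configuration's user agent; the capability name and option usage here are assumptions rather than code from the original project:

        // Hedged sketch of the helper referenced above; assumes phantomjs.exe is available to the process.
        private static IWebDriver CreatePhantomJsDriver(CrawlConfiguration config)
        {
            var options = new PhantomJSOptions();

            // "phantomjs.page.settings.userAgent" is the PhantomJS capability for the browser's user agent string.
            options.AddAdditionalCapability("phantomjs.page.settings.userAgent", config.UserAgentString);

            // The caller is responsible for disposing the driver (see the comment above about zombie processes).
            return new PhantomJSDriver(options);
        }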
Example #4
        static void Main(string[] args)
        {
            CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();

            PoliteWebCrawler crawler = new PoliteWebCrawler();
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                CrawlDecision decision = new CrawlDecision();
                var uri = pageToCrawl.Uri.ToString();
                if (pageToCrawl.IsRoot || uri.StartsWith("http://www.tingchina.com/erge/"))
                {
                    decision.Allow = true;
                }
                else
                {
                    decision.Allow = false;
                    decision.Reason = "Just erge pages!";
                }
                return decision;
            });

            CrawlResult result = crawler.Crawl(new Uri("http://www.tingchina.com/"), cancellationTokenSource);

            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);

            Console.ReadLine();
        }
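The crawler_PageCrawlDisallowed and crawler_PageLinksCrawlDisallowed handlers registered above are not shown. A minimal sketch, assuming the standard Abot event args (PageCrawlDisallowedArgs and PageLinksCrawlDisallowedArgs, each carrying a DisallowedReason):

        // Hedged sketch of the disallowed-event handlers referenced above; not part of the original snippet.
        static void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
        {
            Console.WriteLine("Did not crawl page {0} due to {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
        }

        static void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
        {
            Console.WriteLine("Did not crawl the links on page {0} due to {1}", e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
        }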
        public void Crawl(Uri uri, Action<Page> callback)
        {
            var crawlConfig = new CrawlConfiguration
            {
                CrawlTimeoutSeconds = 0,
                MaxConcurrentThreads = 5,
                UserAgentString = "InspectionCrawler v1.0",
                MinCrawlDelayPerDomainMilliSeconds = 1000,
                MaxPagesToCrawl = 0,
                MaxPagesToCrawlPerDomain = 0,
                MaxCrawlDepth = int.MaxValue
            };

            var crawler = new PoliteWebCrawler(crawlConfig);

            crawler.PageCrawlCompletedAsync += (sender, args) =>
            {
                var page = args.CrawledPage;

                if (page.WebException != null && page.HttpWebResponse == null)
                {
                    _log.Log(new LogMessage(LogType.Error, "Could not get page", page.WebException, page.Uri));
                    return;
                }

                callback(Convert(args.CrawledPage));

            };

            crawler.Crawl(uri);
        }
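The Convert helper and the Page model used in the callback above are application-specific and not included in the snippet. Purely as an illustration, a hypothetical Page DTO and mapping might look like this (the property names below are invented, not taken from the original project):

        // Hypothetical DTO; the real Page type is not shown in the original snippet.
        public class Page
        {
            public Uri Uri { get; set; }
            public int StatusCode { get; set; }
            public string Html { get; set; }
        }

        // Hypothetical mapping from Abot's CrawledPage to the DTO above.
        private static Page Convert(CrawledPage crawledPage)
        {
            return new Page
            {
                Uri = crawledPage.Uri,
                StatusCode = crawledPage.HttpWebResponse != null ? (int)crawledPage.HttpWebResponse.StatusCode : 0,
                Html = crawledPage.Content != null ? crawledPage.Content.Text : null
            };
        }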
Example #6
        public void Crawl_MinCrawlDelayZero_DomainRateLimiterNotCalled()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent 
                { 
                    Text = "content here" 
                }
            };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri> { uri1, uri2 };
            
            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Never());
        }
Example #7
        public WebCrawler(ICrawlingStats crawlingStats, IResultWriter resultWriter, IClock clock)
        {
            _crawlingStats = crawlingStats;
            _resultWriter = resultWriter;
            _clock = clock;
            _webCrawler = new PoliteWebCrawler();
            _webCrawler.PageCrawlCompletedAsync += ProcessPageCrawlCompleted;
            _startCrawlingTime = _clock.FormattedCurrentTime();
            //_resultFilePath = System.Configuration.ConfigurationManager.AppSettings["ResultFileName"];
        }
Example #8
        public void Crawl_MaxPagesTo25_OnlyCrawls25Pages()
        {
            new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();
            configuration.MaxPagesToCrawl = 25;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

            Assert.AreEqual(25, pagesCrawledCount);
        }
Example #9
        public void Crawl_CrawlTimeoutIs1Sec_TimesOut()
        {
            CrawlConfiguration configuration = new CrawlConfiguration ();
            configuration.CrawlTimeoutSeconds = 1;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler (configuration, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            CrawlResult result = crawler.Crawl (new Uri ("http://wvtesting2.com/"));

            Assert.IsFalse (result.ErrorOccurred);
            Assert.IsTrue (result.Elapsed.TotalSeconds < 5);
            Assert.IsTrue (pagesCrawledCount > 0);
        }
        public void CrawlAsync()
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler();
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();

            CrawlResult result = crawler.Crawl(new Uri("http://www.funda.nl"), cancellationTokenSource);

            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }
Example #11
        public void Crawl_MaxPagesTo5_WithCrawlDelay_OnlyCrawls5Pages()
        {
            new PageRequester(new CrawlConfiguration{ UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();
            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; //Adding a delay increases the chance of Abot crawling more pages than MaxPagesToCrawl, which is the issue this test guards against.
            configuration.MaxPagesToCrawl = 5;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

            Assert.AreEqual(5, pagesCrawledCount);
        }
Example #12
        public void Crawl_Asynchronous_CancellationTokenCancelled_StopsCrawl()
        {
            CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();
            System.Timers.Timer timer = new System.Timers.Timer(800);
            timer.Elapsed += (o, e) =>
            {
                cancellationTokenSource.Cancel();
                timer.Stop();
                timer.Dispose();
            };
            timer.Start();

            PoliteWebCrawler crawler = new PoliteWebCrawler();
            Task<CrawlResult> task = Task.Factory.StartNew<CrawlResult>(() => crawler.Crawl(new Uri("http://localhost.fiddler:1111/"), cancellationTokenSource));
            CrawlResult result = task.Result;

            Assert.IsTrue(result.ErrorOccurred);
            Assert.IsTrue(result.ErrorException is OperationCanceledException);
        }
Example #13
        public void Crawl_IsRateLimited()
        {
            new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

            CrawlConfiguration configuration = new CrawlConfiguration();
            configuration.MaxPagesToCrawl = 3;
            configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

            int pagesCrawledCount = 0;

            var crawler = new PoliteWebCrawler(configuration);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
            var start = DateTime.Now;
            crawler.Crawl(uriToCrawl);
            var elapsed = DateTime.Now - start;

            Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
            Assert.AreEqual(3, pagesCrawledCount);
        }
Example #14
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = new PoliteWebCrawler();

            //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
            //For example, http://a.com/ghost would not get crawled if the link were found during the crawl.
            //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost"))
                    return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" };

                return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
            //Abot will still make the http request but will not read the raw content from the stream
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run
            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
            {
                if (crawlContext.CrawledCount >= 5)
                    return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

                return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
            //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                if (!crawledPage.IsInternal)
                    return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };

                return new CrawlDecision { Allow = true };
            });

            return crawler;
        }
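A short usage sketch for the factory above (the root URI is just a placeholder reused from the other examples):

            // Illustrative call site for GetCustomBehaviorUsingLambdaWebCrawler.
            IWebCrawler customCrawler = GetCustomBehaviorUsingLambdaWebCrawler();
            CrawlResult customResult = customCrawler.Crawl(new Uri("http://wvtesting2.com/"));
            Console.WriteLine("Crawl of {0} completed. Error occurred: {1}", customResult.RootUri.AbsoluteUri, customResult.ErrorOccurred);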
Example #15
        static void Main(string[] args)
        {
            //Read Configuration from File
            connectionString = fileTyp.GetConnectionString(appDataPath);
            webURL = fileTyp.GetHostToCrawlString(appDataPath);

            //Will Get the FileTypes to Download
            filters = fileTyp.GetFileTypesToDownlaod(fileTypePath);
            //Will use app.config for configuration
            PoliteWebCrawler crawler = new PoliteWebCrawler();

            #region "Crawler Events"
            crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            #endregion

            CrawlResult result = crawler.Crawl(new Uri(webURL));
            if (result.ErrorOccurred)
                Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.ToString());
            else
                Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }
Example #16
        public static Dictionary<String, CrawledWebPage> Run()
        {
            PoliteWebCrawler crawler = new PoliteWebCrawler(_crawlConfig, null, null, null, null, null, null, null, null);

            //Register the event handlers
            crawler.PageCrawlStartingAsync += ProcessPageCrawlStarting;
            crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync += PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += PageLinksCrawlDisallowed;

            UriBag uris = new UriBag();
            uris.Add("carris", "http://www.carris.pt");
            uris.Add("tfl", "http://www.tfl.gov.uk/");
            uris.Add("publico", "http://www.publico.pt");
            uris.Add("CP", "http://www.cp.pt");
            uris.Add("GVB", "http://en.gvb.nl/");

            crawler.Crawl(uris.Get("CP"));

            return _crawledPages;

            //Tries to save the webpages information
            //Utils.Save(CrawledPages);
        }
        public void Crawl_MinCrawlDelayGreaterThanZero_CallsDomainRateLimiter()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri> { uri1, uri2 };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _dummyConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1;//With a crawl delay above zero we expect the IDomainRateLimiter to be called
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));//With a crawl delay above zero we expect the IDomainRateLimiter to be called
        }
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_ZeroCrawlDelay_DoesNotCallDomainRateLimiter()
        {
            Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
            Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

            CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
            CrawledPage page1 = new CrawledPage(uri1);
            CrawledPage page2 = new CrawledPage(uri2);

            List<Uri> links = new List<Uri> { uri1, uri2 };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
            _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeHttpRequester.VerifyAll();
            _fakeHyperLinkParser.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), It.IsAny<long>()), Times.Exactly(0));
            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(0));
        }
Example #19
        public void Crawl_MinCrawlDelayZero_StillCallsDomainRateLimiter()
        {
            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };

            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(1));
        }
Example #20
        public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_RootPageIsAllowed_AllPagesBelowDisallowed_IsIgnoreRobotsDotTextIfRootDisallowedEnabledTrue_CallsHttpRequester()
        {
            CrawledPage homePage = new CrawledPage(_rootUri)
            {
                Content = new PageContent
                {
                    Text = "content here"
                }
            };
            CrawledPage page1 = new CrawledPage(_rootUri);

            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri, It.IsAny<string>())).Returns(true);
            _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri + "aaaaa", It.IsAny<string>())).Returns(false);
            _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);
            _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
            _dummyConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;
            _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

            _unitUnderTest.Crawl(_rootUri);

            _fakeCrawlDecisionMaker.VerifyAll();
            _fakeRobotsDotText.VerifyAll();
            _fakeRobotsDotTextFinder.VerifyAll();
            _fakeHttpRequester.VerifyAll();
        }
Example #21
        public void Crawl_CrawlTimeoutIs2Secs_TimesOut()
        {
            CrawlConfiguration configuration = new CrawlConfiguration();
            configuration.CrawlTimeoutSeconds = 2;

            int pagesCrawledCount = 0;

            PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
            crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

            CrawlResult result = crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

            Assert.IsFalse(result.ErrorOccurred);
            Assert.IsTrue(result.Elapsed.TotalSeconds < 8, "Took more than 8 seconds");
            Assert.IsTrue(pagesCrawledCount < 2, "Crawled more than 2 pages");
        }