public void SetUp()
{
    _fakeHyperLinkParser = new Mock<IHyperLinkParser>();
    _fakeHttpRequester = new Mock<IPageRequester>();
    _fakeCrawlDecisionMaker = new Mock<ICrawlDecisionMaker>();
    _fakeMemoryManager = new Mock<IMemoryManager>();
    _fakeDomainRateLimiter = new Mock<IDomainRateLimiter>();
    _fakeRobotsDotTextFinder = new Mock<IRobotsDotTextFinder>();

    _dummyScheduler = new Scheduler();
    _dummyThreadManager = new TaskThreadManager(10);
    _dummyConfiguration = new CrawlConfiguration();
    _dummyConfiguration.ConfigurationExtensions.Add("somekey", "someval");

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
    _unitUnderTest.CrawlBag.SomeVal = "SomeVal";
    _unitUnderTest.CrawlBag.SomeList = new List<string>() { "a", "b" };

    _rootUri = new Uri("http://a.com/");
}
public void Crawl_PageCrawlCompletedEvent_IsSynchronous()
{
    _dummyThreadManager = new TaskThreadManager(1);
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    int elapsedTimeForLongJob = 1000;

    _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(new CrawledPage(_rootUri));
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == _rootUri))).Returns(new List<Uri>()
    {
        new Uri(_rootUri.AbsoluteUri + "page2.html"), //should be fired sync
        new Uri(_rootUri.AbsoluteUri + "page3.html"), //should be fired sync
        new Uri(_rootUri.AbsoluteUri + "page4.html"), //should be fired sync
        new Uri(_rootUri.AbsoluteUri + "page5.html")  //should be fired sync since it's the last page to be crawled
    });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });

    _unitUnderTest.PageCrawlCompleted += new EventHandler<PageCrawlCompletedArgs>((sender, args) => System.Threading.Thread.Sleep(elapsedTimeForLongJob));

    Stopwatch timer = Stopwatch.StartNew();
    _unitUnderTest.Crawl(_rootUri);
    timer.Stop();

    //Every page is handled by the synchronous event, so the crawl must take longer than 4 long jobs
    Assert.IsTrue(timer.ElapsedMilliseconds > 4 * elapsedTimeForLongJob);
}
public void Crawl_NotEnoughAvailableMemoryToStartTheCrawl_CrawlIsStoppedBeforeStarting()
{
    _dummyConfiguration.MinAvailableMemoryRequiredInMb = int.MaxValue;
    _fakeMemoryManager.Setup(f => f.IsSpaceAvailable(It.IsAny<int>())).Returns(false);
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    CrawlResult result = _unitUnderTest.Crawl(_rootUri);

    Assert.AreEqual(1, _dummyScheduler.Count); //no need to clear the scheduler since the crawl was never started
    Assert.IsTrue(result.ErrorOccurred);
    Assert.IsTrue(result.ErrorException is InsufficientMemoryException);
    Assert.AreEqual("Process does not have the configured [2147483647mb] of available memory to crawl site [http://a.com/]. This is configurable through the minAvailableMemoryRequiredInMb in app.conf or CrawlConfiguration.MinAvailableMemoryRequiredInMb.", result.ErrorException.Message);
    Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
    Assert.IsFalse(result.CrawlContext.IsCrawlHardStopRequested);
}
public void Crawl_ExceptionThrownByScheduler_SetsCrawlResultError()
{
    Mock<IScheduler> fakeScheduler = new Mock<IScheduler>();
    Exception ex = new Exception("oh no");
    fakeScheduler.Setup(f => f.Count).Throws(ex);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, fakeScheduler.Object, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    CrawlResult result = _unitUnderTest.Crawl(_rootUri);

    fakeScheduler.Verify(f => f.Count, Times.Exactly(1));
    Assert.IsTrue(result.ErrorOccurred);
    Assert.AreSame(ex, result.ErrorException);
}
public void Crawl_ExceptionThrownByFirstShouldSchedulePageLink_SetsCrawlResultError()
{
    _dummyThreadManager = new TaskThreadManager(1);
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
    Exception ex = new Exception("oh no");
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Throws(ex);

    CrawlResult result = _unitUnderTest.Crawl(_rootUri);

    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(1));
    Assert.IsTrue(result.ErrorOccurred);
    Assert.AreSame(ex, result.ErrorException);
    Assert.AreEqual(0, _dummyScheduler.Count);
    Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
    Assert.IsFalse(result.CrawlContext.IsCrawlHardStopRequested);
}
public void Crawl_CrawlHasExceededMaxMemoryUsageInMb_CrawlIsStoppedBeforeCompletion()
{
    _dummyConfiguration.MaxMemoryUsageInMb = 1;
    _fakeMemoryManager.Setup(f => f.GetCurrentUsageInMb()).Returns(2);
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    CrawlResult result = _unitUnderTest.Crawl(_rootUri);

    _fakeMemoryManager.Verify(f => f.GetCurrentUsageInMb(), Times.Exactly(2));
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(1));
    Assert.AreEqual(0, _dummyScheduler.Count);
    Assert.IsTrue(result.ErrorOccurred);
    Assert.IsTrue(result.ErrorException is InsufficientMemoryException);
    Assert.AreEqual("Process is using [2mb] of memory which is above the max configured of [1mb] for site [http://a.com/]. This is configurable through the maxMemoryUsageInMb in app.conf or CrawlConfiguration.MaxMemoryUsageInMb.", result.ErrorException.Message);
    Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
    Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
}
public void Constructor_ConfigValueMaxConcurrentThreadsIsZero_DoesNotThrowException()
{
    _dummyConfiguration.MaxConcurrentThreads = 0;

    //Passing a null thread manager makes the crawler build its own from the config; a MaxConcurrentThreads of zero should not throw
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, null, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
}
public void Crawl_CanExtractRetryAfterTimeFromHeaders()
{
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });
    _fakeCrawlDecisionMaker.SetupSequence(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()))
        .Returns(new CrawlDecision { Allow = true })
        .Returns(new CrawlDecision { Allow = false });
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.IsAny<CrawledPage>())).Returns(new List<Uri>());

    //A 503 response with a "Retry-After: 1" header should make the crawler wait before recrawling the page
    CrawledPage page = new CrawledPage(_rootUri)
    {
        WebException = new WebException(),
        HttpWebResponse = new HttpWebResponseWrapper(HttpStatusCode.ServiceUnavailable, "", null, new WebHeaderCollection { { "Retry-After", "1" } })
    };
    _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page);

    Stopwatch watch = new Stopwatch();
    watch.Start();
    CrawlResult result = _unitUnderTest.Crawl(_rootUri);
    watch.Stop();

    Assert.That(watch.ElapsedMilliseconds, Is.GreaterThan(2000));
    Assert.That(page.RetryAfter, Is.EqualTo(1.0));
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.Exactly(2));
}
public void Crawl_PageCrawlCompletedAsyncEvent_IsAsynchronous()
{
    _dummyThreadManager = new TaskThreadManager(1);
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    int elapsedTimeForLongJob = 2000;

    _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(new CrawledPage(_rootUri));
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == _rootUri))).Returns(new List<Uri>()
    {
        new Uri(_rootUri.AbsoluteUri + "page2.html"), //should be fired async
        new Uri(_rootUri.AbsoluteUri + "page3.html"), //should be fired async
        new Uri(_rootUri.AbsoluteUri + "page4.html"), //should be fired async
        new Uri(_rootUri.AbsoluteUri + "page5.html")  //should be fired SYNC since it's the last page to be crawled
    });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest.PageCrawlCompletedAsync += new EventHandler<PageCrawlCompletedArgs>((sender, args) => System.Threading.Thread.Sleep(elapsedTimeForLongJob));

    Stopwatch timer = Stopwatch.StartNew();
    _unitUnderTest.Crawl(_rootUri);
    timer.Stop();

    //The root uri and last page should be fired synchronously but all others async
    Assert.IsTrue(timer.ElapsedMilliseconds > 2 * elapsedTimeForLongJob); //Takes at least the time to process the 1st and last page
    Assert.IsTrue(timer.ElapsedMilliseconds < 4 * elapsedTimeForLongJob); //Takes no more than the time to process 4 pages since some of them should have been processed async
}
public void Crawl_ExtractedLinksAreNotCheckedTwice()
{
    Uri fakeLink1 = new Uri("http://a.com/someUri");
    Uri fakeLink2 = new Uri("http://a.com/someOtherUri");
    Uri fakeLink3 = new Uri("http://a.com/anotherOne");
    CrawledPage homePage = new CrawledPage(_rootUri);
    CrawledPage page1 = new CrawledPage(fakeLink1);
    CrawledPage page2 = new CrawledPage(fakeLink2);

    // All three links are found on every crawled page.
    _fakeHyperLinkParser.Setup(parser => parser.GetLinks(It.IsAny<CrawledPage>())).Returns(new[] { fakeLink1, fakeLink2, fakeLink3 });

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(fakeLink2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.Is<PageToCrawl>(p => p.Uri == fakeLink3), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    // Each link should be checked only once, so ShouldCrawlPage should be called exactly 4 times (root page + 3 extracted links).
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(4));
    _fakeHyperLinkParser.VerifyAll();
    _fakeCrawlDecisionMaker.VerifyAll();
}