public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    if (crawlContext.CrawledCount + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision
        {
            Allow = false,
            Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl)
        };
    }

    int pagesCrawledInThisDomain = 0;
    if (crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision
            {
                Allow = false,
                Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority)
            };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
    : base(crawlContext)
{
    if (pageToCrawl == null)
        throw new ArgumentNullException("pageToCrawl");

    PageToCrawl = pageToCrawl;
}
public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
    : base(crawlContext, pageToCrawl)
{
    if (string.IsNullOrWhiteSpace(disallowedReason))
        throw new ArgumentNullException("disallowedReason");

    DisallowedReason = disallowedReason;
}
public void SetUp()
{
    _page = new PageToCrawl { Uri = new Uri("http://a.com/") };
    _pages = new List<PageToCrawl>
    {
        new PageToCrawl { Uri = new Uri("http://a.com/") },
        new PageToCrawl { Uri = new Uri("http://b.com/") }
    };
    _fakeCrawledUrlRepo = new Mock<ICrawledUrlRepository>();
    _fakePagesToCrawlRepo = new Mock<IPagesToCrawlRepository>();

    _unitUnderTest = new Scheduler(false, _fakeCrawledUrlRepo.Object, _fakePagesToCrawlRepo.Object);
}
public void Constructor_ValidUri_CreatesInstance()
{
    PageToCrawl unitUnderTest = new PageToCrawl(new Uri("http://a.com/"));

    Assert.AreEqual(false, unitUnderTest.IsRetry);
    Assert.AreEqual(false, unitUnderTest.IsRoot);
    Assert.AreEqual(false, unitUnderTest.IsInternal);
    Assert.AreEqual(null, unitUnderTest.ParentUri);
    Assert.AreEqual("http://a.com/", unitUnderTest.Uri.AbsoluteUri);
    Assert.AreEqual(0, unitUnderTest.CrawlDepth);
}
public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
{
    var page = new PageToCrawl(new Uri(link.TargetUrl));
    page.PageBag.SessionId = link.SessionId;
    page.PageBag.CrawlerId = crawlerId;
    page.ParentUri = new Uri(link.SourceUrl);
    page.CrawlDepth = link.CrawlDepth;
    page.IsInternal = link.IsInternal;
    page.IsRoot = link.IsRoot;
    return page;
}
public void Constructor_CreatesInstance()
{
    PageToCrawl unitUnderTest = new PageToCrawl();

    Assert.AreEqual(false, unitUnderTest.IsRetry);
    Assert.AreEqual(false, unitUnderTest.IsRoot);
    Assert.AreEqual(false, unitUnderTest.IsInternal);
    Assert.AreEqual(null, unitUnderTest.ParentUri);
    Assert.IsNull(unitUnderTest.Uri);
    Assert.AreEqual(0, unitUnderTest.CrawlDepth);
    Assert.IsNull(unitUnderTest.PageBag);
}
public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
{
    var link = new LinkToCrawl();
    link.SessionId = sessionId;
    link.SourceUrl = page.ParentUri.AbsoluteUri;
    link.TargetUrl = page.Uri.AbsoluteUri;
    link.TargetBaseDomain = page.Uri.GetBaseDomain();
    link.CrawlDepth = page.CrawlDepth;
    link.IsRoot = page.IsRoot;
    link.IsInternal = page.IsInternal;
    return link;
}
public static void Add(SchedulerState state, PageToCrawl page)
{
    var json = JsonConvert.SerializeObject(page);
    var url = page.Uri.AbsoluteUri;

    var trans = CreateTransaction(state);
    var crawledPageKey = CrawledPageKey(state.SiteName, url);
    var pageToCrawlKey = PageToCrawlKey(state.SiteName);

    // The transaction only commits if this URL has not been seen before,
    // so a page is marked as seen and queued for crawling at most once.
    trans.AddCondition(Condition.KeyNotExists(crawledPageKey));
    trans.StringSetAsync(crawledPageKey, "");
    trans.ListLeftPushAsync(pageToCrawlKey, json);
    trans.ExecuteAsync().Wait();
}
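// For context, a dequeue counterpart to the Add above would pop from the same Redis list
// and deserialize the JSON back into a PageToCrawl. This is only a hedged sketch:
// GetDatabase is a hypothetical accessor standing in for however SchedulerState exposes
// its StackExchange.Redis IDatabase; PageToCrawlKey and JsonConvert are reused from the
// snippet above.
public static PageToCrawl GetNext(SchedulerState state)
{
    var db = GetDatabase(state);                          // hypothetical accessor
    var pageToCrawlKey = PageToCrawlKey(state.SiteName);  // same key helper used by Add

    // Add pushes onto the left of the list, so popping from the right yields FIFO order.
    RedisValue json = db.ListRightPop(pageToCrawlKey);
    if (json.IsNullOrEmpty)
        return null;

    return JsonConvert.DeserializeObject<PageToCrawl>(json);
}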
public void PageBag()
{
    PageToCrawl unitUnderTest = new PageToCrawl(new Uri("http://a.com/"));
    unitUnderTest.PageBag.SomeVal = "someval";
    unitUnderTest.PageBag.SomeQueue = new Queue<string>();
    unitUnderTest.PageBag.SomeQueue.Enqueue("aaa");
    unitUnderTest.PageBag.SomeQueue.Enqueue("bbb");

    Assert.IsNotNull(unitUnderTest.PageBag);
    Assert.AreEqual("someval", unitUnderTest.PageBag.SomeVal);
    Assert.AreEqual("aaa", unitUnderTest.PageBag.SomeQueue.Dequeue());
    Assert.AreEqual("bbb", unitUnderTest.PageBag.SomeQueue.Dequeue());
}
public void Add(PageToCrawl page) { if (page == null) throw new ArgumentNullException("page"); if (_allowUriRecrawling || page.IsRetry) { _pagesToCrawlRepo.Add(page); } else { if (_crawledUrlRepo.AddIfNew(page.Uri)) _pagesToCrawlRepo.Add(page); } }
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    bool allowedByRobots = true;
    if (_robotsDotText != null)
        allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

    //https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
    var allPathsBelowRootAllowedByRobots = false;
    if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
    {
        var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/")
            ? pageToCrawl.Uri.AbsoluteUri + "aaaaa"
            : pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
        allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
    }

    if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)
    {
        if (!allowedByRobots)
        {
            string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
            _logger.DebugFormat(message);
            allowedByRobots = true;
            _robotsDotText = null;
        }
        else if (!allPathsBelowRootAllowedByRobots)
        {
            string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
            _logger.DebugFormat(message);
            allowedByRobots = true;
            _robotsDotText = null;
        }
    }
    else if (!allowedByRobots)
    {
        string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
        _logger.DebugFormat(message);

        FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
        FirePageCrawlDisallowedEvent(pageToCrawl, message);

        return false;
    }

    return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
}
/// <summary>
/// If this method is called, then it assumes some pre-logic for links to avoid has already
/// been applied and that the <paramref name="page"/> should be stored for future crawling.
/// </summary>
/// <param name="page"></param>
public void Add(PageToCrawl page)
{
    if (page == null)
        throw new ArgumentNullException("page");

    //_logger.DebugFormat("Add(page): Target: {0}, Source: {1}, Root: {2}",
    //    page.Uri.AbsoluteUri,
    //    page.ParentUri.AbsoluteUri,
    //    page.IsRoot);

    page.PageBag.SessionId = SessionId;
    page.PageBag.CrawlerId = CrawlerId;

    using (var factory = _provider.GetInstanceOf<IModelFactory>())
    {
        var link = factory.ConvertToLinkToCrawl(page, SessionId);
        AddLinkToCrawl(link);
    }
}
/// <summary>
/// Schedules the param to be crawled in a FIFO fashion
/// </summary>
public void Add(PageToCrawl page)
{
    if (page == null)
        throw new ArgumentNullException("page");

    if (_allowUriRecrawling)
    {
        //_logger.DebugFormat("Scheduling for crawl [{0}]", page.Uri.AbsoluteUri);
        _pagesToCrawl.Enqueue(page);
    }
    else
    {
        if (_scheduledOrCrawled.TryAdd(page.Uri.AbsoluteUri, null))
        {
            //_logger.DebugFormat("Scheduling for crawl [{0}]", page.Uri.AbsoluteUri);
            _pagesToCrawl.Enqueue(page);
        }
    }
}
public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    if (pageToCrawl == null)
        return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

    if (crawlContext == null)
        return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

    if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
        return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

    if (pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
        return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

    if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
        return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

    //TODO Do we want to ignore redirect chains (i.e. not treat them as separate page crawls)?
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
        crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
    {
        return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
    }

    int pagesCrawledInThisDomain = 0;
    if (!pageToCrawl.IsRetry &&
        crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
        crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
        pagesCrawledInThisDomain > 0)
    {
        if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
            return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
    }

    if (!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
        return new CrawlDecision { Allow = false, Reason = "Link is external" };

    return new CrawlDecision { Allow = true };
}
public void GetNext_MultiplePages_ReturnsInFifoOrder()
{
    PageToCrawl page3 = new PageToCrawl(new Uri("http://abc/"));
    PageToCrawl page4 = new PageToCrawl(new Uri("http://abcd/"));

    _unitUnderTest.Add(_page1);
    _unitUnderTest.Add(_page2);
    _unitUnderTest.Add(page3);
    _unitUnderTest.Add(page4);

    PageToCrawl result1 = _unitUnderTest.GetNext();
    PageToCrawl result2 = _unitUnderTest.GetNext();
    PageToCrawl result3 = _unitUnderTest.GetNext();
    PageToCrawl result4 = _unitUnderTest.GetNext();
    PageToCrawl result5 = _unitUnderTest.GetNext(); //should be null

    Assert.AreSame(_page1, result1);
    Assert.AreSame(_page2, result2);
    Assert.AreSame(page3, result3);
    Assert.AreSame(page4, result4);
    Assert.IsNull(result5);
}
protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    var allowedByRobots = true;
    if (_robotsDotText != null)
        allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

    if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
    {
        _logger.DebugFormat("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
        allowedByRobots = true;
        _robotsDotText = null;
    }
    else if (!allowedByRobots)
    {
        _logger.DebugFormat("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);

        var message = $"Page [{pageToCrawl.Uri.AbsoluteUri}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.";
        FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
        FirePageCrawlDisallowedEvent(pageToCrawl, message);

        return false;
    }

    return base.ShouldCrawlPage(pageToCrawl);
}
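// The robots.txt handling above is driven by configuration values referenced in the snippet
// (RobotsDotTextUserAgentString, IsIgnoreRobotsDotTextIfRootDisallowedEnabled, and the
// IsRespectRobotsDotText setting mentioned in the log message). Below is a minimal wiring
// sketch, assuming Abot's CrawlConfiguration property names and a PoliteWebCrawler
// constructor that accepts a configuration instance; adjust to the version in use.
var config = new CrawlConfiguration
{
    IsRespectRobotsDotTextEnabled = true,                 // honor robots.txt in general (assumed property name)
    IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true,  // but ignore the file when it disallows the root itself
    RobotsDotTextUserAgentString = "abot"                 // user-agent token matched against robots.txt rules
};

var crawler = new PoliteWebCrawler(config);
crawler.Crawl(new Uri("http://example.com/"));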
protected virtual void FirePageCrawlStartingEventAsync(PageToCrawl pageToCrawl)
{
    EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStartingAsync;
    if (threadSafeEvent != null)
    {
        //Fire each subscribers delegate async
        foreach (EventHandler<PageCrawlStartingArgs> del in threadSafeEvent.GetInvocationList())
        {
            del.BeginInvoke(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl), null, null);
        }
    }
}
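// For reference, a consumer would subscribe to this event before starting the crawl.
// A hedged usage sketch (crawler is the PoliteWebCrawler instance from the previous sketch);
// because BeginInvoke runs each subscriber on a thread-pool callback with nothing observing
// the result, handlers should stay fast and avoid throwing.
crawler.PageCrawlStartingAsync += (sender, args) =>
{
    Console.WriteLine("About to crawl {0}", args.PageToCrawl.Uri.AbsoluteUri);
};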
public void GetNext()
{
    Assert.AreEqual(0, _unitUnderTest.Count);

    PageToCrawl page1 = new PageToCrawl(new Uri("http://a.com/1"));
    PageToCrawl page2 = new PageToCrawl(new Uri("http://a.com/2"));
    PageToCrawl page3 = new PageToCrawl(new Uri("http://a.com/3"));

    _unitUnderTest.Add(page1);
    _unitUnderTest.Add(page2);
    _unitUnderTest.Add(page3);

    Assert.AreEqual(3, _unitUnderTest.Count);
    Assert.AreEqual(page1.Uri, _unitUnderTest.GetNext().Uri);
    Assert.AreEqual(page2.Uri, _unitUnderTest.GetNext().Uri);
    Assert.AreEqual(page3.Uri, _unitUnderTest.GetNext().Uri);
    Assert.AreEqual(0, _unitUnderTest.Count);
}
public PageToCrawl GetNext()
{
    // Pick a random listing id in [minRandom, maxRandom) and keep re-rolling until we
    // land on a URL that has not been crawled yet.
    int rnd = (int)((rand1.NextDouble() * (maxRandom - minRandom)) + minRandom); //296030
    Uri tempUri = new Uri("http://us.ebid.net/for-sale/a-" + rnd.ToString() + ".htm");
    //Uri tempUri = new Uri("http://us.ebid.net/for-sale/a-141378296.htm");

    while (_crawledUrlRepo.Contains(tempUri))
    {
        rnd = (int)((rand1.NextDouble() * (maxRandom - minRandom)) + minRandom);
        tempUri = new Uri("http://us.ebid.net/for-sale/a-" + rnd.ToString() + ".htm");
    }

    count--;

    PageToCrawl page = new PageToCrawl(tempUri);
    page.ParentUri = new Uri("http://us.ebid.net/");
    page.CrawlDepth = 1;
    page.IsInternal = true;
    page.IsRoot = false;
    return page;
}
public void Add(PageToCrawl page) { if (page == null) throw new ArgumentNullException("page"); //throw new System.InvalidOperationException("dont use this method!"); }
public void Add(PageToCrawl page)
{
    SchedulerFunc.Add(_state, page);
}
protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
{
    CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);
    if (shouldCrawlPageDecision.Allow)
        shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null)
            ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext)
            : new CrawlDecision { Allow = true };

    if (shouldCrawlPageDecision.Allow)
    {
        AddPageToContext(pageToCrawl);
    }
    else
    {
        _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
        FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
    }

    return shouldCrawlPageDecision.Allow;
}
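// The _shouldCrawlPageDecisionMaker consulted above is a user-supplied hook that runs only
// after the built-in CrawlDecisionMaker has already allowed the page. A hedged sketch of
// registering a custom rule, assuming the crawler exposes Abot's
// ShouldCrawlPage(Func<PageToCrawl, CrawlContext, CrawlDecision>) registration method:
crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
{
    // Skip obvious binary assets by extension; the built-in checks shown earlier have
    // already run by the time this delegate is consulted.
    if (pageToCrawl.Uri.AbsolutePath.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase))
        return new CrawlDecision { Allow = false, Reason = "PDF links are not crawled" };

    return new CrawlDecision { Allow = true };
});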
public void Crawl_CancellationRequestedThroughCrawlDecisionCall_CrawlIsStoppedBeforeCompletion()
{
    //Arrange
    CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();
    PageToCrawl pageToReturn = new PageToCrawl(_rootUri);

    for (int i = 0; i < 100; i++)
        _dummyScheduler.Add(pageToReturn);

    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()))
        .Callback<PageToCrawl, CrawlContext>((p, c) =>
        {
            c.CancellationTokenSource.Cancel();
            System.Threading.Thread.Sleep(500);
        })
        .Returns(new CrawlDecision { Allow = false, Reason = "Should have timed out so this crawl decision doesn't matter." });

    //Act
    CrawlResult result = _unitUnderTest.Crawl(_rootUri, cancellationTokenSource);

    //Assert
    Assert.AreEqual(0, _dummyScheduler.Count);
    Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
    Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
    Assert.IsTrue(result.CrawlContext.CancellationTokenSource.IsCancellationRequested);
}
protected virtual void FirePageCrawlStartingEvent(PageToCrawl pageToCrawl)
{
    try
    {
        EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStarting;
        if (threadSafeEvent != null)
            threadSafeEvent(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl));
    }
    catch (Exception e)
    {
        _logger.Error("An unhandled exception was thrown by a subscriber of the PageCrawlStarting event for url:" + pageToCrawl.Uri.AbsoluteUri);
        _logger.Error(e);
    }
}
public void Crawl_HardStopRequested_CrawlIsStoppedBeforeCompletion()
{
    //Arrange
    PageToCrawl pageToReturn = new PageToCrawl(_rootUri);

    for (int i = 0; i < 100; i++)
        _dummyScheduler.Add(pageToReturn);

    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(new CrawledPage(_rootUri));

    _unitUnderTest.PageCrawlStarting += (e, a) =>
    {
        a.CrawlContext.IsCrawlHardStopRequested = true;
        System.Threading.Thread.Sleep(500);
    };

    //Act
    CrawlResult result = _unitUnderTest.Crawl(_rootUri);

    //Assert
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(1));
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.AtMost(1));
    Assert.AreEqual(0, _dummyScheduler.Count);
    Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
    Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
}
public void Crawl_CancellationRequested_CrawlIsStoppedBeforeCompletion()
{
    //Arrange
    CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();
    System.Timers.Timer timer = new System.Timers.Timer(10);
    timer.Elapsed += (o, e) =>
    {
        cancellationTokenSource.Cancel();
        timer.Stop();
        timer.Dispose();
    };
    timer.Start();

    PageToCrawl pageToReturn = new PageToCrawl(_rootUri);

    for (int i = 0; i < 100; i++)
        _dummyScheduler.Add(pageToReturn);

    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    //Act
    CrawlResult result = _unitUnderTest.Crawl(_rootUri, cancellationTokenSource);
    System.Threading.Thread.Sleep(30);

    //Assert
    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(1));
    Assert.AreEqual(0, _dummyScheduler.Count);
    Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
    Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
    Assert.IsTrue(result.CrawlContext.CancellationTokenSource.IsCancellationRequested);
}
protected virtual void ProcessPage(PageToCrawl pageToCrawl)
{
    try
    {
        if (pageToCrawl == null)
            return;

        if (!ShouldCrawlPage(pageToCrawl))
            return;

        CrawledPage crawledPage = CrawlThePage(pageToCrawl);

        if (PageSizeIsAboveMax(crawledPage))
            return;

        FirePageCrawlCompletedEventAsync(crawledPage);
        FirePageCrawlCompletedEvent(crawledPage);

        if (ShouldCrawlPageLinks(crawledPage))
            SchedulePageLinks(crawledPage);
    }
    catch (Exception e)
    {
        _crawlResult.ErrorException = e;
        _logger.FatalFormat("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
        _logger.Fatal(e);

        _crawlContext.IsCrawlHardStopRequested = true;
    }
}
public void Crawl_OverCrawlTimeoutSeconds_CrawlIsStoppedBeforeCompletion()
{
    _dummyConfiguration.CrawlTimeoutSeconds = 1;

    PageToCrawl pageToReturn = new PageToCrawl(_rootUri);
    CrawledPage crawledPage = new CrawledPage(_rootUri) { ParentUri = _rootUri };

    for (int i = 0; i < 100; i++)
        _dummyScheduler.Add(pageToReturn);

    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()))
        .Callback(() => System.Threading.Thread.Sleep(2000))
        .Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });
    _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(crawledPage);

    CrawlResult result = _unitUnderTest.Crawl(_rootUri);

    _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(1));
    Assert.AreEqual(0, _dummyScheduler.Count);
    Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
    Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
}
protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
{
    _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);
    FirePageCrawlStartingEventAsync(pageToCrawl);
    FirePageCrawlStartingEvent(pageToCrawl);

    CrawledPage crawledPage = _httpRequester.MakeRequest(pageToCrawl.Uri, (x) => ShouldDownloadPageContentWrapper(x));

    // Copy the PageToCrawl state (parent, depth, flags, etc.) onto the CrawledPage.
    // Note: this uses AutoMapper's legacy static API and re-registers the map on every call.
    AutoMapper.Mapper.CreateMap<PageToCrawl, CrawledPage>();
    AutoMapper.Mapper.Map(pageToCrawl, crawledPage);

    if (crawledPage.HttpWebResponse == null)
        _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Parent:[{1}]", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri);
    else
        _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Parent:[{2}]", Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode), crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri);

    return crawledPage;
}