Example #1
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if(pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            if (crawlContext.CrawledCount + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            int pagesCrawledInThisDomain = 0;
            if (crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if(pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
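A minimal usage sketch for the decision method above, assuming it lives on Abot's CrawlDecisionMaker and that CrawlContext exposes a settable CrawlConfiguration; the construction details are assumptions, not verified against a specific Abot version.

        // Hedged sketch: invoking the decision method above directly.
        var decisionMaker = new CrawlDecisionMaker();   // assumed host type of ShouldCrawlPage
        var context = new CrawlContext();               // assumed to allow setting CrawlConfiguration
        context.CrawlConfiguration = new CrawlConfiguration { MaxCrawlDepth = 2, MaxPagesToCrawl = 1000 };

        var page = new PageToCrawl(new Uri("http://a.com/")) { IsInternal = true };
        CrawlDecision decision = decisionMaker.ShouldCrawlPage(page, context);
        if (!decision.Allow)
            Console.WriteLine("Not crawled: " + decision.Reason);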
Example #2
        public PageCrawlStartingArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl)
            : base(crawlContext)
        {
            if (pageToCrawl == null)
                throw new ArgumentNullException("pageToCrawl");

            PageToCrawl = pageToCrawl;
        }
Example #3
        public PageCrawlDisallowedArgs(CrawlContext crawlContext, PageToCrawl pageToCrawl, string disallowedReason)
            : base(crawlContext, pageToCrawl)
        {
            if (string.IsNullOrWhiteSpace(disallowedReason))
                throw new ArgumentNullException("disallowedReason");

            DisallowedReason = disallowedReason;
        }
Example #4
        public void SetUp()
        {
            _page = new PageToCrawl { Uri = new Uri("http://a.com/") };
            _pages = new List<PageToCrawl> { new PageToCrawl { Uri = new Uri("http://a.com/") }, new PageToCrawl { Uri = new Uri("http://b.com/") } };
            _fakeCrawledUrlRepo = new Mock<ICrawledUrlRepository>();
            _fakePagesToCrawlRepo = new Mock<IPagesToCrawlRepository>();

            _unitUnderTest = new Scheduler(false, _fakeCrawledUrlRepo.Object, _fakePagesToCrawlRepo.Object);
        }
Example #5
 public void Constructor_ValidUri_CreatesInstance()
 {
     PageToCrawl unitUnderTest = new PageToCrawl(new Uri("http://a.com/"));
     Assert.AreEqual(false, unitUnderTest.IsRetry);
     Assert.AreEqual(false, unitUnderTest.IsRoot);
     Assert.AreEqual(false, unitUnderTest.IsInternal);
     Assert.AreEqual(null, unitUnderTest.ParentUri);
     Assert.AreEqual("http://a.com/", unitUnderTest.Uri.AbsoluteUri);
     Assert.AreEqual(0, unitUnderTest.CrawlDepth);
 }
Example #6
 public virtual PageToCrawl ConvertToPageToCrawl(LinkToCrawl link, int crawlerId)
 {
     var page = new PageToCrawl(new Uri(link.TargetUrl));
     page.PageBag.SessionId = link.SessionId;
     page.PageBag.CrawlerId = crawlerId;
     page.ParentUri = new Uri(link.SourceUrl);
     page.CrawlDepth = link.CrawlDepth;
     page.IsInternal = link.IsInternal;
     page.IsRoot = link.IsRoot;
     return page;
 }
Example #7
 public void Constructor_CreatesInstance()
 {
     PageToCrawl unitUnderTest = new PageToCrawl();
     Assert.AreEqual(false, unitUnderTest.IsRetry);
     Assert.AreEqual(false, unitUnderTest.IsRoot);
     Assert.AreEqual(false, unitUnderTest.IsInternal);
     Assert.AreEqual(null, unitUnderTest.ParentUri);
     Assert.IsNull(unitUnderTest.Uri);
     Assert.AreEqual(0, unitUnderTest.CrawlDepth);
     Assert.IsNull(unitUnderTest.PageBag);
 }
Example #8
 public virtual LinkToCrawl ConvertToLinkToCrawl(PageToCrawl page, int sessionId)
 {
     var link = new LinkToCrawl();
     link.SessionId = sessionId;
     link.SourceUrl = page.ParentUri.AbsoluteUri;
     link.TargetUrl = page.Uri.AbsoluteUri;
     link.TargetBaseDomain = page.Uri.GetBaseDomain();
     link.CrawlDepth = page.CrawlDepth;
     link.IsRoot = page.IsRoot;
     link.IsInternal = page.IsInternal;
     return link;
 }
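Note that page.ParentUri.AbsoluteUri throws a NullReferenceException for a root page, whose ParentUri defaults to null (see Example #7). A hedged guard for the SourceUrl assignment, if root pages can reach this converter:

      // Hedged variant: fall back to the page's own URL when there is no parent.
      link.SourceUrl = page.ParentUri == null ? page.Uri.AbsoluteUri : page.ParentUri.AbsoluteUri;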
Example #9
		public static void Add(SchedulerState state, PageToCrawl page)
		{
			var json = JsonConvert.SerializeObject(page);
			var url = page.Uri.AbsoluteUri;
			var trans = CreateTransaction(state);
			var crawledPageKey = CrawledPageKey(state.SiteName, url);
			var pageToCrawlKey = PageToCrawlKey(state.SiteName);
			trans.AddCondition(Condition.KeyNotExists(crawledPageKey));
			trans.StringSetAsync(crawledPageKey, "");
			trans.ListLeftPushAsync(pageToCrawlKey, json);
			trans.ExecuteAsync().Wait();
		}
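The Add above pushes the serialized page onto a Redis list through a StackExchange.Redis transaction. A hedged GetNext counterpart that pops the oldest entry for FIFO order; the Database accessor on SchedulerState is an assumption, while the key helper is the one used above:

		public static PageToCrawl GetNext(SchedulerState state)
		{
			// Hedged sketch: ListRightPop pairs with the ListLeftPushAsync above for FIFO order.
			var json = state.Database.ListRightPop(PageToCrawlKey(state.SiteName));
			return json.IsNullOrEmpty ? null : JsonConvert.DeserializeObject<PageToCrawl>(json);
		}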
Example #10
        public void PageBag()
        {
            PageToCrawl unitUnderTest = new PageToCrawl(new Uri("http://a.com/"));
            unitUnderTest.PageBag.SomeVal = "someval";
            unitUnderTest.PageBag.SomeQueue = new Queue<string>();
            unitUnderTest.PageBag.SomeQueue.Enqueue("aaa");
            unitUnderTest.PageBag.SomeQueue.Enqueue("bbb");

            Assert.IsNotNull(unitUnderTest.PageBag);
            Assert.AreEqual("someval", unitUnderTest.PageBag.SomeVal);
            Assert.AreEqual("aaa", unitUnderTest.PageBag.SomeQueue.Dequeue());
            Assert.AreEqual("bbb", unitUnderTest.PageBag.SomeQueue.Dequeue());
        }
Example #11
        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            if (_allowUriRecrawling || page.IsRetry)
            {
                _pagesToCrawlRepo.Add(page);
            }
            else
            {
                if (_crawledUrlRepo.AddIfNew(page.Uri))
                    _pagesToCrawlRepo.Add(page);
            }
        }
Example #12
        protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            bool allowedByRobots = true;
            if (_robotsDotText != null)
                allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

            //https://github.com/sjdirect/abot/issues/96 Handle scenario where the root is allowed but all the paths below are disallowed like "disallow: /*"
            var allPathsBelowRootAllowedByRobots = false;
            if (_robotsDotText != null && pageToCrawl.IsRoot && allowedByRobots)
            {
                var anyPathOffRoot = pageToCrawl.Uri.AbsoluteUri.EndsWith("/") ? pageToCrawl.Uri.AbsoluteUri + "aaaaa": pageToCrawl.Uri.AbsoluteUri + "/aaaaa";
                allPathsBelowRootAllowedByRobots = _robotsDotText.IsUrlAllowed(anyPathOffRoot, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);
            }

            if (_crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled && pageToCrawl.IsRoot)    
            {
                if (!allowedByRobots)
                {
                    string message = string.Format("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.Debug(message); //message is already formatted; DebugFormat would re-parse any braces in the URL
                    allowedByRobots = true;
                    _robotsDotText = null;
                }
                else if (!allPathsBelowRootAllowedByRobots)
                {
                    string message = string.Format("All Pages below [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
                    _logger.Debug(message);
                    allowedByRobots = true;
                    _robotsDotText = null;
                }
            }
            else if (!allowedByRobots)
            {
                string message = string.Format("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
                _logger.Debug(message);

                FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
                FirePageCrawlDisallowedEvent(pageToCrawl, message);

                return false;
            }

            return allowedByRobots && base.ShouldCrawlPage(pageToCrawl);
        }
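A hedged configuration sketch for the behavior exercised above; the property names are taken from this snippet and its log messages rather than from a verified CrawlConfiguration schema:

        var config = new CrawlConfiguration();
        config.IsRespectRobotsDotTextEnabled = true;                 // honor robots.txt in general
        config.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;  // but ignore it when it blocks the root
        config.RobotsDotTextUserAgentString = "abot";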
Example #13
        /// <summary>
        /// If this method is called, then it assumes some pre-logic for links to avoid has already
        /// been applied and that the <paramref name="page"/> should be stored for future crawling.
        /// </summary>
        /// <param name="page"></param>
        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            //_logger.DebugFormat("Add(page): Target: {0}, Source: {1}, Root: {2}",
            //    page.Uri.AbsoluteUri,
            //    page.ParentUri.AbsoluteUri,
            //    page.IsRoot);

            page.PageBag.SessionId = SessionId;
            page.PageBag.CrawlerId = CrawlerId;
            using (var factory = _provider.GetInstanceOf<IModelFactory>())
            {
                var link = factory.ConvertToLinkToCrawl(page, SessionId);
                AddLinkToCrawl(link);
            }
        }
Example #14
        /// <summary>
        /// Schedules the given page to be crawled in FIFO order
        /// </summary>
        public void Add(PageToCrawl page)
        {
            if (page == null)
                throw new ArgumentNullException("page");

            if (_allowUriRecrawling)
            {
                //_logger.DebugFormat("Scheduling for crawl [{0}]", page.Uri.AbsoluteUri);
                _pagesToCrawl.Enqueue(page);
            }
            else
            {
                if (_scheduledOrCrawled.TryAdd(page.Uri.AbsoluteUri, null))
                {
                    //_logger.DebugFormat("Scheduling for crawl [{0}]", page.Uri.AbsoluteUri);
                    _pagesToCrawl.Enqueue(page);
                }
            }
        }
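A minimal GetNext counterpart for the scheduler above, assuming _pagesToCrawl is a ConcurrentQueue<PageToCrawl>:

        public PageToCrawl GetNext()
        {
            // Hedged sketch: FIFO dequeue matching the Enqueue calls in Add above.
            PageToCrawl nextPage;
            _pagesToCrawl.TryDequeue(out nextPage);
            return nextPage; // null when nothing is scheduled
        }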
Example #15
        public virtual CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
        {
            if(pageToCrawl == null)
                return new CrawlDecision { Allow = false, Reason = "Null page to crawl" };

            if (crawlContext == null)
                return new CrawlDecision { Allow = false, Reason = "Null crawl context" };

            if (pageToCrawl.RedirectedFrom != null && pageToCrawl.RedirectPosition > crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects)
                return new CrawlDecision { Allow = false, Reason = string.Format("HttpRequestMaxAutoRedirects limit of [{0}] has been reached", crawlContext.CrawlConfiguration.HttpRequestMaxAutoRedirects) };

            if(pageToCrawl.CrawlDepth > crawlContext.CrawlConfiguration.MaxCrawlDepth)
                return new CrawlDecision { Allow = false, Reason = "Crawl depth is above max" };

            if (!pageToCrawl.Uri.Scheme.StartsWith("http"))
                return new CrawlDecision { Allow = false, Reason = "Scheme does not begin with http" };

            //TODO Do we want to ignore redirect chains (i.e. not treat them as separate page crawls)?
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawl > 0 &&
                crawlContext.CrawledCount + crawlContext.Scheduler.Count + 1 > crawlContext.CrawlConfiguration.MaxPagesToCrawl)
            {
                return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawl limit of [{0}] has been reached", crawlContext.CrawlConfiguration.MaxPagesToCrawl) };
            }

            int pagesCrawledInThisDomain = 0;
            if (!pageToCrawl.IsRetry &&
                crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain > 0 &&
                crawlContext.CrawlCountByDomain.TryGetValue(pageToCrawl.Uri.Authority, out pagesCrawledInThisDomain) &&
                pagesCrawledInThisDomain > 0)
            {
                if (pagesCrawledInThisDomain >= crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain)
                    return new CrawlDecision { Allow = false, Reason = string.Format("MaxPagesToCrawlPerDomain limit of [{0}] has been reached for domain [{1}]", crawlContext.CrawlConfiguration.MaxPagesToCrawlPerDomain, pageToCrawl.Uri.Authority) };
            }

            if(!crawlContext.CrawlConfiguration.IsExternalPageCrawlingEnabled && !pageToCrawl.IsInternal)
                return new CrawlDecision { Allow = false, Reason = "Link is external" };

            return new CrawlDecision { Allow = true };
        }
Example #16
        public void GetNext_MultiplePages_ReturnsInFifoOrder()
        {
            PageToCrawl page3 = new PageToCrawl(new Uri("http://abc/"));
            PageToCrawl page4 = new PageToCrawl(new Uri("http://abcd/"));
            
            _unitUnderTest.Add(_page1);
            _unitUnderTest.Add(_page2);
            _unitUnderTest.Add(page3);
            _unitUnderTest.Add(page4);

            PageToCrawl result1 = _unitUnderTest.GetNext();
            PageToCrawl result2 = _unitUnderTest.GetNext();
            PageToCrawl result3 = _unitUnderTest.GetNext();
            PageToCrawl result4 = _unitUnderTest.GetNext();
            PageToCrawl result5 = _unitUnderTest.GetNext();//should be null

            Assert.AreSame(_page1, result1);
            Assert.AreSame(_page2, result2);
            Assert.AreSame(page3, result3);
            Assert.AreSame(page4, result4);
            Assert.IsNull(result5);
        }
Example #17
		protected override bool ShouldCrawlPage(PageToCrawl pageToCrawl)
		{
			var allowedByRobots = true;
			if (_robotsDotText != null)
				allowedByRobots = _robotsDotText.IsUrlAllowed(pageToCrawl.Uri.AbsoluteUri, _crawlContext.CrawlConfiguration.RobotsDotTextUserAgentString);

			if (!allowedByRobots && pageToCrawl.IsRoot && _crawlContext.CrawlConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled)
			{
				_logger.DebugFormat("Page [{0}] [Disallowed by robots.txt file], however since IsIgnoreRobotsDotTextIfRootDisallowedEnabled is set to true the robots.txt file will be ignored for this site.", pageToCrawl.Uri.AbsoluteUri);
				allowedByRobots = true;
				_robotsDotText = null;
			}
			else if (!allowedByRobots)
			{				
				_logger.DebugFormat("Page [{0}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.", pageToCrawl.Uri.AbsoluteUri);
				var message = $"Page [{pageToCrawl.Uri.AbsoluteUri}] not crawled, [Disallowed by robots.txt file], set IsRespectRobotsDotText=false in config file if you would like to ignore robots.txt files.";
				FirePageCrawlDisallowedEventAsync(pageToCrawl, message);
				FirePageCrawlDisallowedEvent(pageToCrawl, message);

				return false;
			}

			return base.ShouldCrawlPage(pageToCrawl);
		}
Example #18
 protected virtual void FirePageCrawlStartingEventAsync(PageToCrawl pageToCrawl)
 {
     EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStartingAsync;
     if (threadSafeEvent != null)
     {
         //Fire each subscriber's delegate asynchronously
         foreach (EventHandler<PageCrawlStartingArgs> del in threadSafeEvent.GetInvocationList())
         {
             del.BeginInvoke(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl), null, null);
         }
     }
 }
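Delegate.BeginInvoke is only supported on the .NET Framework; on .NET Core and .NET 5+ it throws PlatformNotSupportedException. A hedged alternative for the loop above, queuing each subscriber on the thread pool (requires System.Threading.Tasks):

     // Hedged sketch: thread-pool variant of the BeginInvoke call above.
     foreach (EventHandler<PageCrawlStartingArgs> del in threadSafeEvent.GetInvocationList())
     {
         EventHandler<PageCrawlStartingArgs> subscriber = del;
         Task.Run(() => subscriber(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl)));
     }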
Example #19
        public void GetNext()
        {
            Assert.AreEqual(0, _unitUnderTest.Count);

            PageToCrawl page1 = new PageToCrawl(new Uri("http://a.com/1"));
            PageToCrawl page2 = new PageToCrawl(new Uri("http://a.com/2"));
            PageToCrawl page3 = new PageToCrawl(new Uri("http://a.com/3"));

            _unitUnderTest.Add(page1);
            _unitUnderTest.Add(page2);
            _unitUnderTest.Add(page3);

            Assert.AreEqual(3, _unitUnderTest.Count);
            Assert.AreEqual(page1.Uri, _unitUnderTest.GetNext().Uri);
            Assert.AreEqual(page2.Uri, _unitUnderTest.GetNext().Uri);
            Assert.AreEqual(page3.Uri, _unitUnderTest.GetNext().Uri);
            Assert.AreEqual(0, _unitUnderTest.Count);
        }
Example #20
 public PageToCrawl GetNext()
 {
     int rnd = (int)((rand1.NextDouble() * (maxRandom - minRandom)) + minRandom); //random listing id, e.g. 296030
     Uri tempUri = new Uri("http://us.ebid.net/for-sale/a-" + rnd.ToString() + ".htm");
     //Uri tempUri = new Uri("http://us.ebid.net/for-sale/a-141378296.htm");
     while (_crawledUrlRepo.Contains(tempUri))
     {
         rnd = (int)((rand1.NextDouble() * (maxRandom - minRandom)) + minRandom);
         tempUri = new Uri("http://us.ebid.net/for-sale/a-" + rnd.ToString()+".htm");
     }
     count--;
     PageToCrawl page = new PageToCrawl(tempUri);
     page.ParentUri = new Uri("http://us.ebid.net/");
     page.CrawlDepth = 1;
     page.IsInternal = true;
     page.IsRoot = false;
     return page;
 }
Example #21
 public void Add(PageToCrawl page)
 {
     if (page == null)
         throw new ArgumentNullException("page");
     //throw new System.InvalidOperationException("dont use this method!");
 }
Example #22
		public void Add(PageToCrawl page)
		{			
			SchedulerFunc.Add(_state, page);
		}
Example #23
        protected virtual bool ShouldCrawlPage(PageToCrawl pageToCrawl)
        {
            CrawlDecision shouldCrawlPageDecision = _crawlDecisionMaker.ShouldCrawlPage(pageToCrawl, _crawlContext);
            if (shouldCrawlPageDecision.Allow)
                shouldCrawlPageDecision = (_shouldCrawlPageDecisionMaker != null) ? _shouldCrawlPageDecisionMaker.Invoke(pageToCrawl, _crawlContext) : new CrawlDecision { Allow = true };

            if (shouldCrawlPageDecision.Allow)
            {
                AddPageToContext(pageToCrawl);
            }
            else
            {
                _logger.DebugFormat("Page [{0}] not crawled, [{1}]", pageToCrawl.Uri.AbsoluteUri, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEventAsync(pageToCrawl, shouldCrawlPageDecision.Reason);
                FirePageCrawlDisallowedEvent(pageToCrawl, shouldCrawlPageDecision.Reason);
            }

            return shouldCrawlPageDecision.Allow;
        }
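The _shouldCrawlPageDecisionMaker delegate lets callers veto pages without subclassing the crawler. A hedged registration sketch; the ShouldCrawlPage setter method matches Abot's public delegate API, but treat the exact signature as an assumption:

        // Hedged sketch: installing a custom decision delegate on a crawler instance.
        crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
        {
            if (pageToCrawl.Uri.AbsoluteUri.Contains("/private/"))
                return new CrawlDecision { Allow = false, Reason = "Private section" };
            return new CrawlDecision { Allow = true };
        });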
Example #24
        public void Crawl_CancellationRequestedThroughCrawlDecisionCall_CrawlIsStoppedBeforeCompletion()
        {
            //Arrange
            CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();
            PageToCrawl pageToReturn = new PageToCrawl(_rootUri);
            for (int i = 0; i < 100; i++)
                _dummyScheduler.Add(pageToReturn);

            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()))
            .Callback<PageToCrawl, CrawlContext>((p, c) =>
            {
                c.CancellationTokenSource.Cancel();
                System.Threading.Thread.Sleep(500);
            })
            .Returns(new CrawlDecision { Allow = false, Reason = "Should have timed out so this crawl decision doesn't matter." });

            //Act
            CrawlResult result = _unitUnderTest.Crawl(_rootUri, cancellationTokenSource);

            //Assert
            Assert.AreEqual(0, _dummyScheduler.Count);
            Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
            Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
            Assert.IsTrue(result.CrawlContext.CancellationTokenSource.IsCancellationRequested);
        }
Example #25
 protected virtual void FirePageCrawlStartingEvent(PageToCrawl pageToCrawl)
 {
     try
     {
         EventHandler<PageCrawlStartingArgs> threadSafeEvent = PageCrawlStarting;
         if (threadSafeEvent != null)
             threadSafeEvent(this, new PageCrawlStartingArgs(_crawlContext, pageToCrawl));
     }
     catch (Exception e)
     {
         _logger.Error("An unhandled exception was thrown by a subscriber of the PageCrawlStarting event for url:" + pageToCrawl.Uri.AbsoluteUri);
         _logger.Error(e);
     }
 }
Example #26
        public void Crawl_HardStopRequested_CrawlIsStoppedBeforeCompletion()
        {
            //Arrange
            PageToCrawl pageToReturn = new PageToCrawl(_rootUri);
            for (int i = 0; i < 100; i++)
                _dummyScheduler.Add(pageToReturn);

            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
            _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(new CrawledPage(_rootUri));
            _unitUnderTest.PageCrawlStarting += (e, a) =>
            {
                a.CrawlContext.IsCrawlHardStopRequested = true;
                System.Threading.Thread.Sleep(500);
            };

            //Act
            CrawlResult result = _unitUnderTest.Crawl(_rootUri);

            //Assert
            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(1));
            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>()), Times.AtMost(1));
            Assert.AreEqual(0, _dummyScheduler.Count);
            Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
            Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
        }
Example #27
        public void Crawl_CancellationRequested_CrawlIsStoppedBeforeCompletion()
        {
            //Arrange
            CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();
            System.Timers.Timer timer = new System.Timers.Timer(10);
            timer.Elapsed += (o, e) =>
            {
                cancellationTokenSource.Cancel();
                timer.Stop();
                timer.Dispose();
            };
            timer.Start();

            PageToCrawl pageToReturn = new PageToCrawl(_rootUri);
            for (int i = 0; i < 100; i++)
                _dummyScheduler.Add(pageToReturn);

            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

            //Act
            CrawlResult result = _unitUnderTest.Crawl(_rootUri, cancellationTokenSource);

            System.Threading.Thread.Sleep(30);

            //Assert
            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(1));
            Assert.AreEqual(0, _dummyScheduler.Count);
            Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
            Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
            Assert.IsTrue(result.CrawlContext.CancellationTokenSource.IsCancellationRequested);
        }
Example #28
        protected virtual void ProcessPage(PageToCrawl pageToCrawl)
        {
            try
            {
                if (pageToCrawl == null)
                    return;

                if (!ShouldCrawlPage(pageToCrawl))
                    return;

                CrawledPage crawledPage = CrawlThePage(pageToCrawl);

                if (PageSizeIsAboveMax(crawledPage))
                    return;

                FirePageCrawlCompletedEventAsync(crawledPage);
                FirePageCrawlCompletedEvent(crawledPage);

                if (ShouldCrawlPageLinks(crawledPage))
                    SchedulePageLinks(crawledPage);
            }
            catch(Exception e)
            {
                _crawlResult.ErrorException = e;
                _logger.FatalFormat("Error occurred during processing of page [{0}]", pageToCrawl.Uri);
                _logger.Fatal(e);

                _crawlContext.IsCrawlHardStopRequested = true;
            }
        }
Example #29
        public void Crawl_OverCrawlTimeoutSeconds_CrawlIsStoppedBeforeCompletion()
        {
            _dummyConfiguration.CrawlTimeoutSeconds = 1;

            PageToCrawl pageToReturn = new PageToCrawl(_rootUri);
            CrawledPage crawledPage = new CrawledPage(_rootUri) { ParentUri = _rootUri };

            for (int i = 0; i < 100; i++)
                _dummyScheduler.Add(pageToReturn);

            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()))
                .Callback(() => System.Threading.Thread.Sleep(2000))
                .Returns(new CrawlDecision { Allow = true });
            _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });
            _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(crawledPage);

            CrawlResult result = _unitUnderTest.Crawl(_rootUri);

            _fakeCrawlDecisionMaker.Verify(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>()), Times.Exactly(1));
            Assert.AreEqual(0, _dummyScheduler.Count);
            Assert.IsFalse(result.CrawlContext.IsCrawlStopRequested);
            Assert.IsTrue(result.CrawlContext.IsCrawlHardStopRequested);
        }
Example #30
        protected virtual CrawledPage CrawlThePage(PageToCrawl pageToCrawl)
        {
            _logger.DebugFormat("About to crawl page [{0}]", pageToCrawl.Uri.AbsoluteUri);
            FirePageCrawlStartingEventAsync(pageToCrawl);
            FirePageCrawlStartingEvent(pageToCrawl);

            CrawledPage crawledPage = _httpRequester.MakeRequest(pageToCrawl.Uri, (x) => ShouldDownloadPageContentWrapper(x));
            AutoMapper.Mapper.CreateMap<PageToCrawl, CrawledPage>();
            AutoMapper.Mapper.Map(pageToCrawl, crawledPage);

            if (crawledPage.HttpWebResponse == null)
                _logger.InfoFormat("Page crawl complete, Status:[NA] Url:[{0}] Parent:[{1}]", crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri);
            else
                _logger.InfoFormat("Page crawl complete, Status:[{0}] Url:[{1}] Parent:[{2}]", Convert.ToInt32(crawledPage.HttpWebResponse.StatusCode), crawledPage.Uri.AbsoluteUri, crawledPage.ParentUri);

            return crawledPage;
        }
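Calling Mapper.CreateMap on every page crawl re-registers the same map each time, and the static Mapper API was removed in later AutoMapper versions. A hedged sketch of a one-time configuration using the instance API (AutoMapper 4.2+):

        // Hedged sketch: configure the PageToCrawl -> CrawledPage map once.
        private static readonly IMapper _pageMapper = new MapperConfiguration(
            cfg => cfg.CreateMap<PageToCrawl, CrawledPage>()).CreateMapper();

        // The two static calls in CrawlThePage then reduce to:
        //     _pageMapper.Map(pageToCrawl, crawledPage);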