protected void RunConsumer(CancellationTokenSource cancellationToken)
        {
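            //Assuming _domainsToCrawl is a BlockingCollection<Domain>: GetConsumingEnumerable() blocks until a domain arrives and ends once the producer calls CompleteAdding()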
            foreach (Domain domain in _domainsToCrawl.GetConsumingEnumerable())
            {
                DomainCrawlResult domainCrawlResult = null;

                try
                {
                    domainCrawlResult = _domainConsumer.Consume(domain, cancellationToken);
                }
                catch (Exception e)
                {
                    _logger.ErrorFormat("Domain consumer [{0}] threw exception during Consume().", _domainConsumer.ToString());
                    _logger.Error(e);
                    Stop();
                }

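                //_crawledDomains is shared across consumer threads, hence the lock around the add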
                if (domainCrawlResult != null)
                {
                    lock (_crawledDomains)
                    {
                        _crawledDomains.Add(domainCrawlResult);
                    }
                }

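                //Surface cancellation between domains as an OperationCanceledException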
                if (cancellationToken.IsCancellationRequested)
                {
                    cancellationToken.Token.ThrowIfCancellationRequested();
                }
            }
        }
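A minimal producer-side sketch to pair with the loop above, assuming _domainsToCrawl is a BlockingCollection<Domain> (the type whose GetConsumingEnumerable() the consumer relies on); the method name RunProducer and the domains parameter are hypothetical:

        protected void RunProducer(IEnumerable<Domain> domains)
        {
            foreach (Domain domain in domains)
            {
                _domainsToCrawl.Add(domain);
            }

            //Without this call, GetConsumingEnumerable() in RunConsumer blocks forever waiting for more domains
            _domainsToCrawl.CompleteAdding();
        }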
Example #2
        public void Constructor()
        {
            DomainCrawlResult uut = new DomainCrawlResult();

            Assert.IsNull(uut.CrawlResult);
            Assert.IsNull(uut.Domain);
        }
Example #3
        public void Consume_ValidDomain_CrawlerCrawlBagSet()
        {
            //Arrange
            Domain domain = new Domain {
                DomainId = 1, Uri = new Uri("http://a.com")
            };
            CrawlContext context    = GetCrawlContext(_dummyCrawlProcessors);
            CrawlResult  fakeResult = new CrawlResult {
                CrawlContext = context
            };

            _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
            _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>())).Returns(fakeResult);
            _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

            //Act
            DomainCrawlResult result = _uut.Consume(domain, _dummyCancellationToken);

            //Assert
            _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
            _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
            _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>()), Times.Exactly(1));

            Assert.AreEqual(domain, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.Domain);
            Assert.AreEqual(_dummyProcessorContext.PrimaryPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.PrimaryPersistenceProvider);
            Assert.AreEqual(_dummyProcessorContext.BackupPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.BackupPersistenceProvider);
            Assert.AreEqual(_dummyCrawlProcessors, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.CrawlProcessors);
        }
Example #4
        public void Consume_CrawlerThrowsException_PageAndDomainNotProcessed()
        {
            //Arrange
            Exception ex = new Exception("oh no");

            _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
            _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>())).Throws(ex);
            _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

            //Act
            DomainCrawlResult result = _uut.Consume(new Domain {
                DomainId = 1, Uri = new Uri("http://a.com")
            }, _dummyCancellationToken);

            //Assert
            _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
            _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
            _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny <Uri>(), It.IsAny <CancellationTokenSource>()), Times.Exactly(1));

            Assert.IsTrue(result.CrawlResult.ErrorOccurred);
            Assert.AreEqual(ex, result.CrawlResult.ErrorException);

            _fakeProcessor1.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(0));
            _fakeProcessor2.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(0));
            _fakeProcessor3.Verify(f => f.ProcessCrawledPage(It.IsAny <CrawlContext>(), It.IsAny <CrawledPage>()), Times.Exactly(0));

            _fakeProcessor1.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(0));
            _fakeProcessor2.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(0));
            _fakeProcessor3.Verify(f => f.ProcessCrawledDomain(It.IsAny <CrawlContext>()), Times.Exactly(0));
        }
Example #5
        public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
        {
            if (domain == null)
            {
                throw new ArgumentNullException("domain");
            }

            if (cancellationToken == null)
            {
                throw new ArgumentNullException("cancellationToken");
            }

            //Must materialize with .ToList(); deferred execution would otherwise create a new instance of each processor for every page
            IEnumerable <ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();
            IWebCrawler crawler = CreateCrawlerInstance();

            DomainCrawlResult domainCrawlResult = new DomainCrawlResult();

            domainCrawlResult.Domain = domain;
            try
            {
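                //Stash the processing context on the crawler's dynamic CrawlBag so processors can reach it during the crawl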
                crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
                {
                    Domain = domain,
                    PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
                    BackupPersistenceProvider  = _processorContext.BackupPersistenceProvider,
                    CrawlProcessors            = processors
                };

                domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);

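                //Run the domain-level processors once the crawl of the whole site has finished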
                ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
            }
            catch (Exception ex)
            {
                string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message);
                domainCrawlResult.CrawlResult = new CrawlResult {
                    ErrorException = ex
                };

                _logger.Error(errorMessage, ex);
                //TODO StatsG fatal error occurred during crawl
                StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_FatalErrorOccured, _config);
            }

            LogCrawlResult(domainCrawlResult.CrawlResult);
            return domainCrawlResult;
        }
Example #6
        protected void RunConsumers(CancellationTokenSource cancellationToken)
        {
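            //Bound the crawl concurrency by configuration, defaulting to the machine's logical processor count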
            ParallelOptions options = new ParallelOptions
            {
                CancellationToken      = cancellationToken.Token,
                MaxDegreeOfParallelism = (_config.MaxConcurrentCrawls > 0) ? _config.MaxConcurrentCrawls : System.Environment.ProcessorCount
            };

            Parallel.ForEach(_domainsToCrawl.GetConsumingEnumerable(), options, domain =>
            {
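                //Check for cancellation before starting each domain's crawl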
                if (cancellationToken.IsCancellationRequested)
                {
                    cancellationToken.Token.ThrowIfCancellationRequested();
                }

                DomainCrawlResult domainCrawlResult = null;
                try
                {
                    domainCrawlResult = _domainConsumer.Consume(domain, cancellationToken);
                }
                catch (Exception e)
                {
                    _logger.ErrorFormat("Domain consumer [{0}] threw exception during Consume().", _domainConsumer.ToString());
                    _logger.Error(e);
                    Stop();
                }

                if (domainCrawlResult != null)
                {
                    lock (_crawledDomains)
                    {
                        _crawledDomains.Add(domainCrawlResult);
                    }
                }
            });
        }
Example #7
        public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
        {
            if (domain == null)
            {
                throw new ArgumentNullException("domain");
            }

            if (cancellationToken == null)
            {
                throw new ArgumentNullException("cancellationToken");
            }

            LogCrawlBegin(domain);

            //Must materialize with .ToList(); deferred execution would otherwise create a new instance of each processor for every page
            IEnumerable <ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();
            IWebCrawler crawler = CreateCrawlerInstance();

            DomainCrawlResult domainCrawlResult = new DomainCrawlResult();

            domainCrawlResult.Domain = domain;
            try
            {
                crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
                {
                    Domain = domain,
                    PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
                    BackupPersistenceProvider  = _processorContext.BackupPersistenceProvider,
                    CrawlProcessors            = processors
                };

                //Call the parked-page processor first; if the domain is parked there is no need to crawl anything
                ICrawlProcessor parkedProc = processors.FirstOrDefault(p => p.GetType().Name == "ParkedCrawlProcessor");
                CrawlContext    cc         = new CrawlContext {
                    RootUri = domain.Uri, CrawlBag = crawler.CrawlBag
                };
                if (parkedProc != null)
                {
                    parkedProc.ProcessCrawledDomain(cc);
                }

                //If the domain is not parked, or there's no parked-page processor, continue crawling the site
                if (parkedProc == null || !cc.CrawlBag.NoCrawl)
                {
                    domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);
                    ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
                }
            }
            catch (Exception ex)
            {
                string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message);
                domainCrawlResult.CrawlResult = new CrawlResult {
                    ErrorException = ex
                };

                _logger.Error(errorMessage, ex);
            }

            if (domainCrawlResult.CrawlResult != null) //could be null if we don't crawl it due to being a parked page or no A record
            {
                LogCrawlResult(domainCrawlResult.CrawlResult);
            }

            return domainCrawlResult;
        }