/// <summary>
/// Sequentially drains the crawl queue, consuming one domain at a time until the
/// queue is marked complete. Each successful result is appended to the shared
/// _crawledDomains list; a consumer exception is logged and triggers Stop(), and
/// cancellation aborts the loop via OperationCanceledException.
/// </summary>
protected void RunConsumer(CancellationTokenSource cancellationToken)
{
    foreach (Domain domain in _domainsToCrawl.GetConsumingEnumerable())
    {
        DomainCrawlResult result = null;
        try
        {
            result = _domainConsumer.Consume(domain, cancellationToken);
        }
        catch (Exception e)
        {
            _logger.ErrorFormat("Domain consumer [{0}] threw exception during Consume().", _domainConsumer.ToString());
            _logger.Error(e);
            Stop();
        }

        if (result != null)
        {
            // _crawledDomains is shared across consumers; guard every mutation.
            lock (_crawledDomains)
            {
                _crawledDomains.Add(result);
            }
        }

        // Honor cancellation between domains.
        if (cancellationToken.IsCancellationRequested)
        {
            cancellationToken.Token.ThrowIfCancellationRequested();
        }
    }
}
/// <summary>
/// A freshly constructed DomainCrawlResult should have neither a Domain nor a
/// CrawlResult assigned.
/// </summary>
public void Constructor()
{
    DomainCrawlResult result = new DomainCrawlResult();

    Assert.IsNull(result.Domain);
    Assert.IsNull(result.CrawlResult);
}
/// <summary>
/// Consume() should stamp the crawler's CrawlBag with a GoDaddyProcessorContext
/// carrying the crawled domain, both persistence providers and the processor list.
/// </summary>
public void Consume_ValidDomain_CrawlerCrawlBagSet()
{
    //Arrange
    Domain domainToCrawl = new Domain { DomainId = 1, Uri = new Uri("http://a.com") };
    CrawlResult stubbedResult = new CrawlResult { CrawlContext = GetCrawlContext(_dummyCrawlProcessors) };

    _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);
    _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
    _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>())).Returns(stubbedResult);

    //Act
    DomainCrawlResult result = _uut.Consume(domainToCrawl, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));

    Assert.AreEqual(domainToCrawl, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.Domain);
    Assert.AreEqual(_dummyProcessorContext.PrimaryPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.PrimaryPersistenceProvider);
    Assert.AreEqual(_dummyProcessorContext.BackupPersistenceProvider, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.BackupPersistenceProvider);
    Assert.AreEqual(_dummyCrawlProcessors, _fakeWebCrawler.Object.CrawlBag.GoDaddyProcessorContext.CrawlProcessors);
}
/// <summary>
/// When the crawler throws, Consume() should return a result whose CrawlResult
/// records the error, and no page-level or domain-level processing should run
/// on any of the three processors.
/// </summary>
public void Consume_CrawlerThrowsException_PageAndDomainNotProcessed()
{
    //Arrange
    Exception ex = new Exception("oh no");
    _fakeWebCrawlerFactory.Setup(f => f.CreateInstance()).Returns(_fakeWebCrawler.Object);
    _fakeWebCrawler.Setup(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>())).Throws(ex);
    _fakeProcessorProvider.Setup(f => f.GetProcessors()).Returns(_dummyCrawlProcessors);

    //Act
    DomainCrawlResult result = _uut.Consume(new Domain { DomainId = 1, Uri = new Uri("http://a.com") }, _dummyCancellationToken);

    //Assert
    _fakeProcessorProvider.Verify(f => f.GetProcessors(), Times.Exactly(1));
    _fakeWebCrawlerFactory.Verify(f => f.CreateInstance(), Times.Exactly(1));
    _fakeWebCrawler.Verify(f => f.Crawl(It.IsAny<Uri>(), It.IsAny<CancellationTokenSource>()), Times.Exactly(1));
    Assert.IsTrue(result.CrawlResult.ErrorOccurred);
    Assert.AreEqual(ex, result.CrawlResult.ErrorException);
    _fakeProcessor1.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(0));
    _fakeProcessor2.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(0));
    _fakeProcessor3.Verify(f => f.ProcessCrawledPage(It.IsAny<CrawlContext>(), It.IsAny<CrawledPage>()), Times.Exactly(0));
    // BUG FIX: the original verified _fakeProcessor1 three times here, so
    // ProcessCrawledDomain was never checked on processors 2 and 3.
    _fakeProcessor1.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(0));
    _fakeProcessor2.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(0));
    _fakeProcessor3.Verify(f => f.ProcessCrawledDomain(It.IsAny<CrawlContext>()), Times.Exactly(0));
}
/// <summary>
/// Crawls a single domain: wires a GoDaddyProcessorContext into the crawler's
/// CrawlBag, runs the crawl, then runs domain-level processing on the result.
/// Any exception is captured in the returned CrawlResult rather than propagated.
/// </summary>
/// <param name="domain">The domain to crawl; must not be null.</param>
/// <param name="cancellationToken">Token source used to cancel the crawl; must not be null.</param>
/// <returns>A DomainCrawlResult whose CrawlResult carries either the crawl outcome or the error.</returns>
public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
{
    if (domain == null)
    {
        throw new ArgumentNullException("domain");
    }
    if (cancellationToken == null)
    {
        throw new ArgumentNullException("cancellationToken");
    }

    //have to .ToList() since the deferred execution will cause a new instance of each processor to be created with every page
    IEnumerable<ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();
    IWebCrawler crawler = CreateCrawlerInstance();

    DomainCrawlResult domainCrawlResult = new DomainCrawlResult();
    domainCrawlResult.Domain = domain;
    try
    {
        crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
        {
            Domain = domain,
            PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
            BackupPersistenceProvider = _processorContext.BackupPersistenceProvider,
            CrawlProcessors = processors
        };

        domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);
        ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
    }
    catch (Exception ex)
    {
        string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message);
        domainCrawlResult.CrawlResult = new CrawlResult { ErrorException = ex };

        // BUG FIX: was _logger.ErrorFormat(errorMessage, ex), which treats the
        // already-formatted message as a format string (stray braces in the URI
        // or exception text would corrupt it) and drops the stack trace.
        _logger.Error(errorMessage, ex);

        //TODO Statsg fatal error occurred during crawl
        StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_FatalErrorOccured, _config);
    }

    LogCrawlResult(domainCrawlResult.CrawlResult);

    return domainCrawlResult;
}
/// <summary>
/// Consumes queued domains in parallel, bounded by MaxConcurrentCrawls (falling
/// back to the machine's processor count when unset). Each result is added to
/// the shared _crawledDomains list; a consumer exception is logged and triggers
/// Stop(), and cancellation aborts the parallel loop.
/// </summary>
protected void RunConsumers(CancellationTokenSource cancellationToken)
{
    // FIX: removed the unused ManualResetEvent (an allocated, never-disposed
    // IDisposable) and the unused 'locker' object the original created.
    ParallelOptions options = new ParallelOptions
    {
        CancellationToken = cancellationToken.Token,
        MaxDegreeOfParallelism = (_config.MaxConcurrentCrawls > 0) ? _config.MaxConcurrentCrawls : System.Environment.ProcessorCount
    };

    Parallel.ForEach(_domainsToCrawl.GetConsumingEnumerable(), options, domain =>
    {
        // Honor cancellation before starting work on each domain.
        if (cancellationToken.IsCancellationRequested)
        {
            cancellationToken.Token.ThrowIfCancellationRequested();
        }

        DomainCrawlResult domainCrawlResult = null;
        try
        {
            domainCrawlResult = _domainConsumer.Consume(domain, cancellationToken);
        }
        catch (Exception e)
        {
            _logger.ErrorFormat("Domain consumer [{0}] threw exception during Consume().", _domainConsumer.ToString());
            _logger.Error(e);
            Stop();
        }

        if (domainCrawlResult != null)
        {
            // _crawledDomains is mutated from multiple parallel workers.
            lock (_crawledDomains)
            {
                _crawledDomains.Add(domainCrawlResult);
            }
        }
    });
}
/// <summary>
/// Crawls a single domain. First runs the parked-page processor (located by
/// type name) against a lightweight context; if that processor sets
/// CrawlBag.NoCrawl, the full crawl is skipped. Otherwise crawls the site and
/// runs domain-level processing. Exceptions are captured in the returned
/// CrawlResult rather than propagated.
/// </summary>
/// <param name="domain">The domain to crawl; must not be null.</param>
/// <param name="cancellationToken">Token source used to cancel the crawl; must not be null.</param>
/// <returns>
/// A DomainCrawlResult; its CrawlResult may be null when the crawl was skipped
/// (parked page / no A record).
/// </returns>
public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
{
    if (domain == null)
    {
        throw new ArgumentNullException("domain");
    }
    if (cancellationToken == null)
    {
        throw new ArgumentNullException("cancellationToken");
    }

    LogCrawlBegin(domain);

    //have to .ToList() since the deferred execution will cause a new instance of each processor to be created with every page
    IEnumerable<ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();
    IWebCrawler crawler = CreateCrawlerInstance();

    DomainCrawlResult domainCrawlResult = new DomainCrawlResult();
    domainCrawlResult.Domain = domain;
    try
    {
        crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
        {
            Domain = domain,
            PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
            BackupPersistenceProvider = _processorContext.BackupPersistenceProvider,
            CrawlProcessors = processors
        };

        //call parkedpage processor. if parked, no need to crawl anything
        // NOTE(review): lookup by type-name string is fragile — renaming
        // ParkedCrawlProcessor would silently disable this short-circuit.
        ICrawlProcessor parkedProc = processors.FirstOrDefault(p => p.GetType().Name == "ParkedCrawlProcessor");
        CrawlContext cc = new CrawlContext { RootUri = domain.Uri, CrawlBag = crawler.CrawlBag };
        if (parkedProc != null)
        {
            parkedProc.ProcessCrawledDomain(cc);
        }

        //if not parked or theres no parked processor, continue crawling the site
        if (parkedProc == null || !cc.CrawlBag.NoCrawl)
        {
            domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);
            ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
        }
    }
    catch (Exception ex)
    {
        string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message);
        domainCrawlResult.CrawlResult = new CrawlResult { ErrorException = ex };

        // BUG FIX: was _logger.ErrorFormat(errorMessage, ex), which treats the
        // already-formatted message as a format string (stray braces in the URI
        // or exception text would corrupt it) and drops the stack trace.
        _logger.Error(errorMessage, ex);
    }

    if (domainCrawlResult.CrawlResult != null) //could be null if we don't crawl it due to being a parked page or no A record
    {
        LogCrawlResult(domainCrawlResult.CrawlResult);
    }

    return domainCrawlResult;
}