Ejemplo n.º 1
0
        /// <summary>
        /// Records a single StatsG metric item, provided StatsG logging is enabled
        /// in the supplied configuration; otherwise does nothing.
        /// </summary>
        /// <param name="statsGTypeName">Name of the StatsG metric to record.</param>
        /// <param name="configuration">Configuration consulted for the StatsG enabled flag and appender setup.</param>
        /// <param name="value">Metric value to record; defaults to 1.</param>
        public static void LogItem(string statsGTypeName, CrawlDaddyConfig configuration, float value = 1)
        {
            // Bail out early when StatsG logging is switched off.
            if (!configuration.StatsGEnabled)
            {
                return;
            }

            // Route the item through the process-wide appender instance.
            GetSingleton(configuration).LogItem(statsGTypeName, value);
        }
Ejemplo n.º 2
0
        //private void ProcessCrawledPage(CrawlContext crawlContext, CrawledPage crawledPage)
        //{
        //    IEnumerable<ICrawlProcessor> processors = crawlContext.CrawlBag.GoDaddyProcessorContext.CrawlProcessors;

        //    Task timedPageProcessorTask;
        //    CancellationTokenSource tokenSource;
        //    System.Timers.Timer timeoutTimer;
        //    Stopwatch timer;
        //    foreach (ICrawlProcessor processor in processors)
        //    {
        //        tokenSource = new CancellationTokenSource();
        //        timeoutTimer = new System.Timers.Timer(_config.MaxPageProcessorTimeInMilliSecs);
        //        timeoutTimer.Elapsed += (sender, e) =>
        //        {
        //            timeoutTimer.Stop();
        //            tokenSource.Cancel();
        //            _logger.ErrorFormat("Crawled page processor [{0}] timed out on page [{1}]. Max configured processing time is [{2}] millisecs.", processor.ToString(), crawledPage.Uri, _config.MaxPageProcessorTimeInMilliSecs);
        //        };

        //        try
        //        {
        //            timeoutTimer.Start();
        //            timer = Stopwatch.StartNew();
        //            timedPageProcessorTask = Task.Factory.StartNew(() => processor.ProcessCrawledPage(crawlContext, crawledPage), tokenSource.Token, TaskCreationOptions.LongRunning, TaskScheduler.Default);
        //            timedPageProcessorTask.Wait(_config.MaxPageProcessorTimeInMilliSecs + 50);//wait an additional 50 millisecs give timeouttimer a chance to log
        //            timeoutTimer.Stop();
        //            timer.Stop();
        //            _logger.DebugFormat("Crawled page processor [{0}] completed processing page [{1}] in [{2}] millisecs.", processor.ToString(), crawledPage.Uri, timer.ElapsedMilliseconds);
        //        }
        //        catch (AggregateException ae)
        //        {
        //            timeoutTimer.Stop();
        //            _logger.ErrorFormat("Crawled page processor [{0}] threw exception while processing page [{1}]", processor.ToString(), crawledPage.Uri);
        //            _logger.Error(ae);
        //        }
        //    }
        //}

        //private void ProcessCrawledDomain(CrawlContext crawlContext)
        //{
        //    IEnumerable<ICrawlProcessor> processors = crawlContext.CrawlBag.GoDaddyProcessorContext.CrawlProcessors;

        //    Task timedPageProcessorTask;
        //    CancellationTokenSource tokenSource;
        //    System.Timers.Timer timeoutTimer;
        //    Stopwatch timer;
        //    foreach (ICrawlProcessor processor in processors)
        //    {
        //        tokenSource = new CancellationTokenSource();
        //        timeoutTimer = new System.Timers.Timer(_config.MaxDomainProcessorTimeInMilliSecs);
        //        timeoutTimer.Elapsed += (sender, e) =>
        //        {
        //            timeoutTimer.Stop();
        //            tokenSource.Cancel();
        //            _logger.ErrorFormat("Crawled domain processor [{0}] timed out on domain [{1}]. Max configured processing time is [{2}] millisecs.", processor.ToString(), crawlContext.RootUri, _config.MaxDomainProcessorTimeInMilliSecs);
        //        };

        //        try
        //        {
        //            timeoutTimer.Start();
        //            timer = Stopwatch.StartNew();
        //            timedPageProcessorTask = Task.Factory.StartNew(() => processor.ProcessCrawledDomain(crawlContext), tokenSource.Token, TaskCreationOptions.LongRunning, TaskScheduler.Default);
        //            timedPageProcessorTask.Wait(_config.MaxDomainProcessorTimeInMilliSecs + 50);//wait an additional 50 millisecs give timeouttimer a chance to log
        //            timeoutTimer.Stop();
        //            timer.Stop();
        //            _logger.DebugFormat("Crawled domain processor [{0}] completed processing domain [{1}] in [{2}] millisecs.", processor.ToString(), crawlContext.RootUri, timer.ElapsedMilliseconds);
        //        }
        //        catch (AggregateException ae)
        //        {
        //            timeoutTimer.Stop();
        //            _logger.ErrorFormat("Crawled domain processor [{0}] threw exception while processing domain [{1}]", processor.ToString(), crawlContext.RootUri);
        //            _logger.Error(ae);
        //        }
        //    }
        //}

        /// <summary>
        /// Logs the outcome of a finished crawl: an error entry (plus a StatsG metric)
        /// when the crawl failed, or info entries to the main and throughput logs on success.
        /// </summary>
        /// <param name="crawlResult">Result of the completed crawl; inspected but not modified.</param>
        private void LogCrawlResult(CrawlResult crawlResult)
        {
            // CrawlContext can be null when the crawl failed before a context was built,
            // so guard the page-count lookup once here instead of in each branch.
            // (The original only guarded the error branch; the success branch could NRE.)
            int crawledPageCount = crawlResult.CrawlContext != null ? crawlResult.CrawlContext.CrawledUrls.Count : 0;

            if (crawlResult.ErrorOccurred)
            {
                _logger.ErrorFormat("Crawl for domain [{0}] failed after [{1}] seconds, crawled [{2}] pages", crawlResult.RootUri, crawlResult.Elapsed.TotalSeconds, crawledPageCount);
                _logger.Error(crawlResult.ErrorException);
                // Record the crawl error in StatsG.
                StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_ErrorOccuredDuringCrawl, _config);
            }
            else
            {
                _logger.InfoFormat("Crawl for domain [{0}] completed successfully in [{1}] seconds, crawled [{2}] pages", crawlResult.RootUri, crawlResult.Elapsed.TotalSeconds, crawledPageCount);
                _throughputLogger.InfoFormat("Crawl for domain [{0}] completed successfully in [{1}] seconds, crawled [{2}] pages", crawlResult.RootUri, crawlResult.Elapsed.TotalSeconds, crawledPageCount);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Returns the process-wide <c>StatsGLoggerAppender</c>, creating it on first use.
        /// </summary>
        /// <param name="config">Configuration used to construct the appender the first time only.</param>
        /// <returns>The shared appender instance.</returns>
        private static StatsGLoggerAppender GetSingleton(CrawlDaddyConfig config)
        {
            // Double-checked locking: skip the lock entirely once the instance exists,
            // and re-check inside the lock so only one thread ever constructs it.
            if (singleton != null)
            {
                return singleton;
            }

            lock (singletonLock)
            {
                if (singleton == null)
                {
                    singleton = new StatsGLoggerAppender(config);
                }

                return singleton;
            }
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Crawls the given domain and runs the configured crawl processors over the result.
        /// Any exception during the crawl is caught, logged (with a StatsG fatal-error metric),
        /// and surfaced via <c>CrawlResult.ErrorException</c> rather than rethrown.
        /// </summary>
        /// <param name="domain">Domain to crawl; must not be null.</param>
        /// <param name="cancellationToken">Token source used to cancel the crawl; must not be null.</param>
        /// <returns>The crawl outcome for the domain; never null.</returns>
        /// <exception cref="ArgumentNullException">If <paramref name="domain"/> or <paramref name="cancellationToken"/> is null.</exception>
        public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
        {
            if (domain == null)
            {
                throw new ArgumentNullException("domain");
            }

            if (cancellationToken == null)
            {
                throw new ArgumentNullException("cancellationToken");
            }

            IEnumerable <ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();//have to .ToList() since the deferred execution will cause a new instance of each processor to be created with every page
            IWebCrawler crawler = CreateCrawlerInstance();

            DomainCrawlResult domainCrawlResult = new DomainCrawlResult();

            domainCrawlResult.Domain = domain;
            try
            {
                // Hand the processors and persistence providers to the crawler via its bag
                // so page/domain processing can reach them during the crawl.
                crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
                {
                    Domain = domain,
                    PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
                    BackupPersistenceProvider  = _processorContext.BackupPersistenceProvider,
                    CrawlProcessors            = processors
                };

                domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);

                ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
            }
            catch (Exception ex)
            {
                string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]", domain.Uri.AbsoluteUri, ex.Message);
                domainCrawlResult.CrawlResult = new CrawlResult {
                    ErrorException = ex
                };

                // Log the message as a literal plus the exception. The previous
                // ErrorFormat(errorMessage, ex) treated the already-formatted string as a
                // format pattern: any '{' or '}' in ex.Message would throw FormatException,
                // and ex was silently dropped as an unused format argument.
                _logger.Error(errorMessage, ex);
                // Record the fatal crawl error in StatsG.
                StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_FatalErrorOccured, _config);
            }

            LogCrawlResult(domainCrawlResult.CrawlResult);
            return(domainCrawlResult);
        }