/// <summary>
/// Records a single value against the named StatsG counter.
/// No-op when StatsG logging is disabled in <paramref name="configuration"/>.
/// </summary>
/// <param name="statsGTypeName">Name of the StatsG counter to log against.</param>
/// <param name="configuration">Crawler configuration; its StatsGEnabled flag gates all logging.</param>
/// <param name="value">Amount to record; defaults to 1.</param>
public static void LogItem(string statsGTypeName, CrawlDaddyConfig configuration, float value = 1)
{
    // Guard clause: skip the singleton lookup entirely when stats are disabled.
    if (!configuration.StatsGEnabled)
    {
        return;
    }

    GetSingleton(configuration).LogItem(statsGTypeName, value);
}
// NOTE(review): ~70 lines of commented-out ProcessCrawledPage/ProcessCrawledDomain
// implementations were removed here (dead code); retrieve from version control if needed.

/// <summary>
/// Logs the outcome of a completed crawl: an error entry (with the captured
/// exception) and a StatsG counter on failure, or an info entry to both the
/// main and throughput logs on success.
/// </summary>
/// <param name="crawlResult">Result of the crawl; its CrawlContext may be null
/// when the crawl failed before a context was created.</param>
private void LogCrawlResult(CrawlResult crawlResult)
{
    // The error branch of the original guarded CrawlContext against null but the
    // success branch did not; compute the page count null-safely once for both.
    int crawledPageCount = crawlResult.CrawlContext != null
        ? crawlResult.CrawlContext.CrawledUrls.Count
        : 0;

    if (crawlResult.ErrorOccurred)
    {
        _logger.ErrorFormat("Crawl for domain [{0}] failed after [{1}] seconds, crawled [{2}] pages",
            crawlResult.RootUri, crawlResult.Elapsed.TotalSeconds, crawledPageCount);
        _logger.Error(crawlResult.ErrorException);
        StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_ErrorOccuredDuringCrawl, _config);
    }
    else
    {
        // Same message goes to both logs; format it once instead of duplicating the template.
        string message = string.Format(
            "Crawl for domain [{0}] completed successfully in [{1}] seconds, crawled [{2}] pages",
            crawlResult.RootUri, crawlResult.Elapsed.TotalSeconds, crawledPageCount);
        _logger.Info(message);
        _throughputLogger.Info(message);
    }
}
/// <summary>
/// Returns the process-wide StatsGLoggerAppender, creating it on first use.
/// Uses double-checked locking: the unsynchronized outer null check skips the
/// lock on the common (already-initialized) path; the inner check prevents two
/// first-time callers from both constructing an instance.
/// NOTE(review): for full memory-model safety the 'singleton' field should be
/// declared volatile (or this replaced with Lazy&lt;T&gt;) — the field declaration
/// is outside this view, confirm.
/// </summary>
/// <param name="config">Configuration used only when the instance is first constructed; ignored on later calls.</param>
/// <returns>The shared StatsGLoggerAppender instance; never null.</returns>
private static StatsGLoggerAppender GetSingleton(CrawlDaddyConfig config) { // if this is the first time it is being called, we set up the private static members if (singleton == null) { lock (singletonLock) { if (singleton == null) { singleton = new StatsGLoggerAppender(config); } } } return(singleton); }
/// <summary>
/// Crawls the given domain and runs the registered crawl processors against the
/// result, logging the outcome before returning it.
/// </summary>
/// <param name="domain">Domain to crawl; must not be null.</param>
/// <param name="cancellationToken">Token source used to cancel the crawl; must not be null.</param>
/// <returns>The crawl outcome; never null. On failure, CrawlResult.ErrorException carries the exception.</returns>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="domain"/> or <paramref name="cancellationToken"/> is null.</exception>
public DomainCrawlResult Consume(Domain domain, CancellationTokenSource cancellationToken)
{
    if (domain == null)
    {
        throw new ArgumentNullException("domain");
    }
    if (cancellationToken == null)
    {
        throw new ArgumentNullException("cancellationToken");
    }

    // Have to .ToList() since deferred execution would cause a new instance of
    // each processor to be created with every page.
    IEnumerable<ICrawlProcessor> processors = _processorProvider.GetProcessors().ToList();
    IWebCrawler crawler = CreateCrawlerInstance();
    DomainCrawlResult domainCrawlResult = new DomainCrawlResult();
    domainCrawlResult.Domain = domain;
    try
    {
        crawler.CrawlBag.GoDaddyProcessorContext = new ProcessorContext
        {
            Domain = domain,
            PrimaryPersistenceProvider = _processorContext.PrimaryPersistenceProvider,
            BackupPersistenceProvider = _processorContext.BackupPersistenceProvider,
            CrawlProcessors = processors
        };
        domainCrawlResult.CrawlResult = crawler.Crawl(domain.Uri, cancellationToken);
        ProcessCrawledDomain(domainCrawlResult.CrawlResult.CrawlContext);
    }
    catch (Exception ex)
    {
        string errorMessage = string.Format("Exception occurred while crawling [{0}], error: [{1}]",
            domain.Uri.AbsoluteUri, ex.Message);
        domainCrawlResult.CrawlResult = new CrawlResult { ErrorException = ex };
        // BUG FIX: the original called ErrorFormat(errorMessage, ex), which treats the
        // already-formatted message as a format string (a brace in a crawled URL would
        // throw FormatException) and silently drops the exception since it fills no
        // placeholder. Error(object, Exception) logs the message and stack trace.
        _logger.Error(errorMessage, ex);
        StatsGLoggerAppender.LogItem(StatLogType.CrawlDaddy_FatalErrorOccured, _config);
    }
    LogCrawlResult(domainCrawlResult.CrawlResult);
    return domainCrawlResult;
}