/// <summary>
/// Creates a crawler that tracks crawl progress and persists results by subscribing to the
/// <c>PageScheduled</c>, <c>PageProcessing</c>, <c>PageCrawled</c> and <c>WebsiteScheduled</c>
/// events of the supplied scheduler.
/// </summary>
/// <param name="crawlScheduler">Scheduler whose events drive the counters and persistence below.</param>
/// <param name="crawlPersister">Persister used to save crawl results and website records.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="crawlScheduler"/> or <paramref name="crawlPersister"/> is null.
/// </exception>
public WebsiteCrawler(ICrawlScheduler crawlScheduler, ICrawlPersister crawlPersister)
{
    // Fail fast: a null scheduler would otherwise surface as a NullReferenceException on the first
    // subscription, and a null persister only as a logged error on every crawled page.
    if (crawlScheduler == null)
    {
        throw new ArgumentNullException(nameof(crawlScheduler));
    }

    if (crawlPersister == null)
    {
        throw new ArgumentNullException(nameof(crawlPersister));
    }

    this.crawlScheduler = crawlScheduler;
    this.crawlPersister = crawlPersister;

    // A page was queued: only the scheduled counter changes.
    crawlScheduler.PageScheduled += crawlUrl =>
    {
        Interlocked.Increment(ref scheduledUrlsCount);
        // Log.DebugFormat("Scheduled '{0}' - scheduled '{1}', processing '{2}'", crawlUrl.Url, scheduledUrlsCount, processingUrlsCount);
    };

    // A page is about to be fetched.
    crawlScheduler.PageProcessing += crawlUrl =>
    {
        // Log the value produced by this increment rather than re-reading the shared field,
        // which another handler invocation may already have changed.
        var processing = Interlocked.Increment(ref processingUrlsCount);
        Log.InfoFormat("Processing '{0}' - scheduled '{1}', processing '{2}'", crawlUrl.Url, scheduledUrlsCount, processing);
    };

    // A page finished: update counters, then persist the result and the owning website's last-visit time.
    crawlScheduler.PageCrawled += crawlResult =>
    {
        try
        {
            var processing = Interlocked.Decrement(ref processingUrlsCount);
            var scheduled = Interlocked.Decrement(ref scheduledUrlsCount);
            var crawled = Interlocked.Increment(ref totalCrawledCount);

            // crawlStarted is assigned outside this constructor.
            var elapsed = DateTime.Now - crawlStarted;
            Log.InfoFormat("Crawled '{0}' - scheduled '{1}', processing '{2}', crawled {3} in {4}", crawlResult.CrawlUrl.Url, scheduled, processing, crawled, elapsed);

            crawlPersister.Save(crawlResult);

            var website = crawlResult.CrawlUrl.WebsiteDefinition.Website;
            website.LastVisit = DateTimeOffset.Now;
            crawlPersister.Save(website);
        }
        catch (Exception ex)
        {
            // Failures are logged and swallowed so they do not propagate to the scheduler that raised the event.
            Log.Error(ex);
        }
    };

    // A website was added to the crawl: stamp the start time, reset its page count and persist it.
    crawlScheduler.WebsiteScheduled += website =>
    {
        try
        {
            Log.InfoFormat("Added website {0}", website.RootUrl);
            website.LastCrawlStartedAt = DateTimeOffset.Now;
            website.PagesCrawled = 0;
            crawlPersister.Save(website);
        }
        catch (Exception ex)
        {
            Log.Error(ex);
        }
    };
}
/// <summary>
/// Creates a crawler that tracks crawl progress and persists results by subscribing to the
/// <c>PageScheduled</c>, <c>PageProcessing</c>, <c>PageCrawled</c> and <c>WebsiteScheduled</c>
/// events of the supplied scheduler.
/// </summary>
/// <param name="crawlScheduler">Scheduler whose events drive the counters and persistence below.</param>
/// <param name="crawlPersister">Persister used to save crawl results and website records.</param>
/// <exception cref="ArgumentNullException">
/// <paramref name="crawlScheduler"/> or <paramref name="crawlPersister"/> is null.
/// </exception>
public WebsiteCrawler(ICrawlScheduler crawlScheduler, ICrawlPersister crawlPersister)
{
    // Fail fast: a null scheduler would otherwise surface as a NullReferenceException on the first
    // subscription, and a null persister only as a logged error on every crawled page.
    if (crawlScheduler == null)
    {
        throw new ArgumentNullException(nameof(crawlScheduler));
    }

    if (crawlPersister == null)
    {
        throw new ArgumentNullException(nameof(crawlPersister));
    }

    this.crawlScheduler = crawlScheduler;
    this.crawlPersister = crawlPersister;

    // A page was queued: only the scheduled counter changes.
    crawlScheduler.PageScheduled += crawlUrl =>
    {
        Interlocked.Increment(ref scheduledUrlsCount);
        // Log.DebugFormat("Scheduled '{0}' - scheduled '{1}', processing '{2}'", crawlUrl.Url, scheduledUrlsCount, processingUrlsCount);
    };

    // A page is about to be fetched.
    crawlScheduler.PageProcessing += crawlUrl =>
    {
        // Log the value produced by this increment rather than re-reading the shared field,
        // which another handler invocation may already have changed.
        var processing = Interlocked.Increment(ref processingUrlsCount);
        Log.InfoFormat("Processing '{0}' - scheduled '{1}', processing '{2}'", crawlUrl.Url, scheduledUrlsCount, processing);
    };

    // A page finished: update counters, then persist the result and the owning website's last-visit time.
    crawlScheduler.PageCrawled += crawlResult =>
    {
        try
        {
            var processing = Interlocked.Decrement(ref processingUrlsCount);
            var scheduled = Interlocked.Decrement(ref scheduledUrlsCount);
            var crawled = Interlocked.Increment(ref totalCrawledCount);

            // crawlStarted is assigned outside this constructor.
            var elapsed = DateTime.Now - crawlStarted;
            Log.InfoFormat("Crawled '{0}' - scheduled '{1}', processing '{2}', crawled {3} in {4}", crawlResult.CrawlUrl.Url, scheduled, processing, crawled, elapsed);

            crawlPersister.Save(crawlResult);

            var website = crawlResult.CrawlUrl.WebsiteDefinition.Website;
            website.LastVisit = DateTimeOffset.Now;
            crawlPersister.Save(website);
        }
        catch (Exception ex)
        {
            // Failures are logged and swallowed so they do not propagate to the scheduler that raised the event.
            Log.Error(ex);
        }
    };

    // A website was added to the crawl: stamp the start time, reset its page count and persist it.
    crawlScheduler.WebsiteScheduled += website =>
    {
        try
        {
            Log.InfoFormat("Added website {0}", website.RootUrl);
            website.LastCrawlStartedAt = DateTimeOffset.Now;
            website.PagesCrawled = 0;
            crawlPersister.Save(website);
        }
        catch (Exception ex)
        {
            Log.Error(ex);
        }
    };
}
/// <summary>
/// Wires up a crawler from its collaborators, creates the crawl context for this instance and
/// attaches <c>WebCrawler_PageCrawlStartingAsync</c> to the <c>PageCrawlStartingAsync</c> event.
/// </summary>
/// <param name="logger">Logger kept by the crawler and also handed to the new crawl context.</param>
/// <param name="crawlConfiguration">Configuration kept by the crawler and set on the crawl context.</param>
/// <param name="threadManager">Thread manager stored for later use.</param>
/// <param name="crawlDecisionMaker">Crawl decision maker stored for later use.</param>
/// <param name="crawlScheduler">Scheduler stored for later use.</param>
/// <param name="pageRequester">Page requester stored for later use.</param>
/// <param name="hyperLinkParser">Document parser stored as the crawler's hyperlink parser.</param>
/// <param name="rateLimiter">Rate limiter stored for later use.</param>
public WebCrawler(
    ILogger<WebCrawler> logger,
    CrawlConfiguration crawlConfiguration,
    IThreadManager threadManager,
    ICrawlDecisionMaker crawlDecisionMaker,
    ICrawlScheduler crawlScheduler,
    IPageRequester pageRequester,
    IDocumentParser hyperLinkParser,
    IRateLimiter rateLimiter)
{
    // Injected collaborators.
    _threadManager = threadManager;
    _crawlDecisionMaker = crawlDecisionMaker;
    _scheduler = crawlScheduler;
    _pageRequester = pageRequester;
    _hyperLinkParser = hyperLinkParser;
    _rateLimiter = rateLimiter;

    // Logging and configuration; the context receives the same logger and configuration instances.
    _logger = logger;
    _crawlConfiguration = crawlConfiguration;
    _crawlContext = new CrawlContext(logger)
    {
        CrawlConfiguration = crawlConfiguration
    };

    // Built-in handler for the page-crawl-starting event.
    PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;
}
/// <summary>
/// Creates a <c>ThinkWebCrawler</c>: the standard collaborators are forwarded unchanged to the base
/// constructor, the Think-specific configuration is taken from <paramref name="thinkCrawlOpt"/>,
/// and <c>Check()</c> is then invoked.
/// </summary>
/// <param name="logger">Forwarded to the base constructor.</param>
/// <param name="crawlConfiguration">Forwarded to the base constructor.</param>
/// <param name="threadManager">Forwarded to the base constructor.</param>
/// <param name="crawlDecisionMaker">Forwarded to the base constructor.</param>
/// <param name="crawlScheduler">Forwarded to the base constructor.</param>
/// <param name="pageRequester">Forwarded to the base constructor.</param>
/// <param name="hyperLinkParser">Forwarded to the base constructor.</param>
/// <param name="rateLimiter">Forwarded to the base constructor.</param>
/// <param name="thinkCrawlOpt">Options wrapper whose <c>Value</c> becomes this crawler's Think configuration.</param>
public ThinkWebCrawler(
    ILogger<WebCrawler> logger,
    CrawlConfiguration crawlConfiguration,
    IThreadManager threadManager,
    ICrawlDecisionMaker crawlDecisionMaker,
    ICrawlScheduler crawlScheduler,
    IPageRequester pageRequester,
    IDocumentParser hyperLinkParser,
    IRateLimiter rateLimiter,
    IOptions<ThinkCrawlConfiguration> thinkCrawlOpt)
    : base(
        logger,
        crawlConfiguration,
        threadManager,
        crawlDecisionMaker,
        crawlScheduler,
        pageRequester,
        hyperLinkParser,
        rateLimiter)
{
    // Unwrap the options once and keep the resulting configuration.
    var thinkConfiguration = thinkCrawlOpt.Value;
    _thinkCrawlConfiguration = thinkConfiguration;

    // Check() is defined outside this view.
    // NOTE(review): presumably validates the configuration just stored — confirm.
    Check();
}