/// <summary>
/// Wires the crawler to its scheduler: maintains the scheduled/processing/crawled
/// counters, logs progress, and persists crawl results and website state as the
/// scheduler raises its events.
/// </summary>
/// <param name="crawlScheduler">Raises page/website lifecycle events; subscriptions live for the lifetime of this instance.</param>
/// <param name="crawlPersister">Persists crawl results and website records.</param>
/// <exception cref="ArgumentNullException">Either dependency is null.</exception>
public WebsiteCrawler(ICrawlScheduler crawlScheduler, ICrawlPersister crawlPersister)
{
    // Fail at construction rather than with an NRE on the first event.
    if (crawlScheduler == null) throw new ArgumentNullException(nameof(crawlScheduler));
    if (crawlPersister == null) throw new ArgumentNullException(nameof(crawlPersister));

    this.crawlScheduler = crawlScheduler;
    this.crawlPersister = crawlPersister;

    crawlScheduler.PageScheduled += crawlUrl =>
    {
        Interlocked.Increment(ref scheduledUrlsCount);
    };

    crawlScheduler.PageProcessing += crawlUrl =>
    {
        // Log the value this increment produced; re-reading the field could
        // observe concurrent updates from other handlers.
        var processing = Interlocked.Increment(ref processingUrlsCount);
        Log.InfoFormat("Processing '{0}' - scheduled '{1}', processing '{2}'",
            crawlUrl.Url, Interlocked.CompareExchange(ref scheduledUrlsCount, 0, 0), processing);
    };

    crawlScheduler.PageCrawled += crawlResult =>
    {
        // Exceptions are contained so a persistence failure for one page
        // cannot tear down the scheduler's event dispatch.
        try
        {
            // Use the Interlocked return values so the logged counters are a
            // consistent snapshot of what this handler actually produced.
            var processing = Interlocked.Decrement(ref processingUrlsCount);
            var scheduled = Interlocked.Decrement(ref scheduledUrlsCount);
            var crawled = Interlocked.Increment(ref totalCrawledCount);
            // NOTE(review): elapsed uses wall-clock DateTime.Now while the rest of
            // the class uses DateTimeOffset.Now — confirm whether UTC is intended.
            var elapsed = DateTime.Now - crawlStarted;
            Log.InfoFormat("Crawled '{0}' - scheduled '{1}', processing '{2}', crawled {3} in {4}",
                crawlResult.CrawlUrl.Url, scheduled, processing, crawled, elapsed);

            crawlPersister.Save(crawlResult);
            crawlResult.CrawlUrl.WebsiteDefinition.Website.LastVisit = DateTimeOffset.Now;
            crawlPersister.Save(crawlResult.CrawlUrl.WebsiteDefinition.Website);
        }
        catch (Exception ex)
        {
            Log.Error(ex);
        }
    };

    crawlScheduler.WebsiteScheduled += website =>
    {
        try
        {
            Log.InfoFormat("Added website {0}", website.RootUrl);
            // Reset per-crawl state before the first page of this run is processed.
            website.LastCrawlStartedAt = DateTimeOffset.Now;
            website.PagesCrawled = 0;
            crawlPersister.Save(website);
        }
        catch (Exception ex)
        {
            Log.Error(ex);
        }
    };
}
/// <summary>
/// Records the completion of a full website crawl: stamps the end time,
/// stores the page count from the result, and persists the website record.
/// </summary>
/// <param name="website">The website whose crawl just finished.</param>
/// <param name="result">Aggregate result of the crawl run.</param>
private void WebsiteCrawlFinished(Website website, CrawlResult result)
{
    try
    {
        Log.InfoFormat("Finished crawl for website {0}", website.RootUrl);

        // The crawl is considered ended at the moment of the last page visit
        // (LastVisit), not at the time this callback runs.
        website.LastCrawlEndedAt = website.LastVisit;
        website.PagesCrawled = result.NumberOfPagesCrawled;

        crawlPersister.Save(website);
    }
    catch (Exception ex)
    {
        // Persistence failures are logged and swallowed so completion
        // notification cannot crash the crawler.
        Log.Error(ex);
    }
}