Beispiel #1
0
		/// <summary>
		/// Creates a crawler and subscribes it to the scheduler's events so that
		/// crawl progress is counted and logged, and crawl results and website
		/// state are handed to the persister.
		/// </summary>
		/// <param name="crawlScheduler">Scheduler whose <c>PageScheduled</c>, <c>PageProcessing</c>, <c>PageCrawled</c> and <c>WebsiteScheduled</c> events are subscribed to.</param>
		/// <param name="crawlPersister">Persister used to save crawl results and websites. Not validated here; a failing save is caught and logged by the handlers.</param>
		/// <exception cref="ArgumentNullException"><paramref name="crawlScheduler"/> is null.</exception>
		public WebsiteCrawler(ICrawlScheduler crawlScheduler, ICrawlPersister crawlPersister)
		{
			// Fail fast with a descriptive exception instead of a NullReferenceException
			// at the first event subscription below.
			if (crawlScheduler == null)
			{
				throw new ArgumentNullException("crawlScheduler");
			}

			this.crawlScheduler = crawlScheduler;
			this.crawlPersister = crawlPersister;

			// A newly scheduled page only bumps the scheduled counter; nothing is logged.
			crawlScheduler.PageScheduled += crawlUrl =>
				{
					Interlocked.Increment(ref scheduledUrlsCount);
				};

			crawlScheduler.PageProcessing += crawlUrl =>
				{
					// Log the value produced by this increment rather than re-reading the
					// shared field, which another handler invocation may already have changed.
					var processingNow = Interlocked.Increment(ref processingUrlsCount);

					Log.InfoFormat("Processing '{0}' - scheduled '{1}', processing '{2}'", crawlUrl.Url, scheduledUrlsCount, processingNow);
				};

			crawlScheduler.PageCrawled += crawlResult =>
				{
					try
					{
						// Capture the results of the atomic updates so the log line reports
						// the counter values that belong to this event.
						var processingNow = Interlocked.Decrement(ref processingUrlsCount);
						var scheduledNow = Interlocked.Decrement(ref scheduledUrlsCount);
						var crawledNow = Interlocked.Increment(ref totalCrawledCount);

						var elapsed = DateTime.Now - crawlStarted;

						Log.InfoFormat("Crawled '{0}' - scheduled '{1}', processing '{2}', crawled {3} in {4}", crawlResult.CrawlUrl.Url, scheduledNow, processingNow, crawledNow, elapsed);

						crawlPersister.Save(crawlResult);

						// Stamp the owning website with the time of this visit and persist it.
						var crawledWebsite = crawlResult.CrawlUrl.WebsiteDefinition.Website;
						crawledWebsite.LastVisit = DateTimeOffset.Now;
						crawlPersister.Save(crawledWebsite);
					}
					catch (Exception ex)
					{
						// Any failure in this handler is logged and not rethrown.
						Log.Error(ex);
					}
				};

			crawlScheduler.WebsiteScheduled += website =>
				{
					try
					{
						Log.InfoFormat("Added website {0}", website.RootUrl);

						// Reset the per-crawl bookkeeping on the website before saving it.
						website.LastCrawlStartedAt = DateTimeOffset.Now;
						website.PagesCrawled = 0;

						crawlPersister.Save(website);
					}
					catch (Exception ex)
					{
						// Any failure in this handler is logged and not rethrown.
						Log.Error(ex);
					}
				};
		}
Beispiel #2
0
        /// <summary>
        /// Creates a crawler and subscribes it to the scheduler's events so that
        /// crawl progress is counted and logged, and crawl results and website
        /// state are handed to the persister.
        /// </summary>
        /// <param name="crawlScheduler">Scheduler whose <c>PageScheduled</c>, <c>PageProcessing</c>, <c>PageCrawled</c> and <c>WebsiteScheduled</c> events are subscribed to.</param>
        /// <param name="crawlPersister">Persister used to save crawl results and websites. Not validated here; a failing save is caught and logged by the handlers.</param>
        /// <exception cref="ArgumentNullException"><paramref name="crawlScheduler"/> is null.</exception>
        public WebsiteCrawler(ICrawlScheduler crawlScheduler, ICrawlPersister crawlPersister)
        {
            // Fail fast with a descriptive exception instead of a NullReferenceException
            // at the first event subscription below.
            if (crawlScheduler == null)
            {
                throw new ArgumentNullException("crawlScheduler");
            }

            this.crawlScheduler = crawlScheduler;
            this.crawlPersister = crawlPersister;

            // A newly scheduled page only bumps the scheduled counter; nothing is logged.
            crawlScheduler.PageScheduled += crawlUrl =>
            {
                Interlocked.Increment(ref scheduledUrlsCount);
            };

            crawlScheduler.PageProcessing += crawlUrl =>
            {
                // Log the value produced by this increment rather than re-reading the
                // shared field, which another handler invocation may already have changed.
                var processingNow = Interlocked.Increment(ref processingUrlsCount);

                Log.InfoFormat("Processing '{0}' - scheduled '{1}', processing '{2}'", crawlUrl.Url, scheduledUrlsCount, processingNow);
            };

            crawlScheduler.PageCrawled += crawlResult =>
            {
                try
                {
                    // Capture the results of the atomic updates so the log line reports
                    // the counter values that belong to this event.
                    var processingNow = Interlocked.Decrement(ref processingUrlsCount);
                    var scheduledNow = Interlocked.Decrement(ref scheduledUrlsCount);
                    var crawledNow = Interlocked.Increment(ref totalCrawledCount);

                    var elapsed = DateTime.Now - crawlStarted;

                    Log.InfoFormat("Crawled '{0}' - scheduled '{1}', processing '{2}', crawled {3} in {4}", crawlResult.CrawlUrl.Url, scheduledNow, processingNow, crawledNow, elapsed);

                    crawlPersister.Save(crawlResult);

                    // Stamp the owning website with the time of this visit and persist it.
                    var crawledWebsite = crawlResult.CrawlUrl.WebsiteDefinition.Website;
                    crawledWebsite.LastVisit = DateTimeOffset.Now;
                    crawlPersister.Save(crawledWebsite);
                }
                catch (Exception ex)
                {
                    // Any failure in this handler is logged and not rethrown.
                    Log.Error(ex);
                }
            };

            crawlScheduler.WebsiteScheduled += website =>
            {
                try
                {
                    Log.InfoFormat("Added website {0}", website.RootUrl);

                    // Reset the per-crawl bookkeeping on the website before saving it.
                    website.LastCrawlStartedAt = DateTimeOffset.Now;
                    website.PagesCrawled = 0;

                    crawlPersister.Save(website);
                }
                catch (Exception ex)
                {
                    // Any failure in this handler is logged and not rethrown.
                    Log.Error(ex);
                }
            };
        }