예제 #1
0
		/// <summary>
		/// Schedules a crawl of <paramref name="target"/> and, once the scheduled task has produced a
		/// result, stamps the visit time on the website and passes the outcome to
		/// <c>WebsiteCrawlFinished</c>.
		/// </summary>
		/// <param name="target">The website to crawl.</param>
		/// <returns>The crawl result produced by the scheduler for <paramref name="target"/>.</returns>
		/// <remarks>
		/// If the scheduled task is cancelled or faulted, its original exception propagates to the
		/// awaiter and neither <c>LastVisit</c> nor the completion bookkeeping is touched.
		/// </remarks>
		public async Task<CrawlResult> RunAsync(Website target)
		{
			crawlStarted = DateTime.Now;

			// Await the scheduled task directly instead of chaining ContinueWith and reading t.Result:
			// reading Result on a cancelled/faulted task throws an AggregateException, which hid the
			// real failure (and, for a null target, was preceded by a NullReferenceException).
			// ConfigureAwait(false) keeps the follow-up work off the caller's synchronization context,
			// as the previous ContinueWith-based continuation did not resume on it either.
			var result = await crawlScheduler.Schedule(target).ConfigureAwait(false);

			target.LastVisit = DateTimeOffset.Now;

			WebsiteCrawlFinished(target, result);

			return result;
		}
예제 #2
0
        /// <summary>
        /// Registers a website for crawling, queues its root URL and returns the task that
        /// represents the crawl result.
        /// </summary>
        /// <param name="website">
        /// The website to crawl. Its <c>RootUrl</c> is rewritten in place with any <c>#fragment</c>
        /// and trailing slashes removed.
        /// </param>
        /// <returns>
        /// The completion task of the existing definition when a definition for this website
        /// (compared with <c>==</c>) is already registered; a cancelled task when
        /// <paramref name="website"/> is null or its root URL is blank; otherwise the completion
        /// task of the newly created definition.
        /// </returns>
        public Task<CrawlResult> Schedule(Website website)
        {
            WebsiteProcessingDefinition websiteProcessingDefinition;
            lock (websiteLock)
            {
                // The duplicate check and the registration share one critical section. Doing the
                // lookup outside the lock let two concurrent calls for the same website both miss
                // it and register twice, and read the collection while another thread was adding.
                var existing = websiteDefinitions.FirstOrDefault(x => x.Website == website);
                if (existing != null)
                    return websiteProcessingDefinitions[existing].CompletionSource.Task;

                if (website == null || string.IsNullOrWhiteSpace(website.RootUrl))
                {
                    // Nothing to crawl: hand back an already-cancelled task instead of throwing.
                    var cancelledTask = new TaskCompletionSource<CrawlResult>();
                    cancelledTask.SetCanceled();

                    return cancelledTask.Task;
                }

                // Normalise the root URL: drop the fragment part and any trailing slashes.
                website.RootUrl = website.RootUrl.Split('#')[0].TrimEnd('/');

                var processingBlock = CreateProcessingBlock(website);

                var websiteDefinition = new WebsiteDefinition
                    {
                        Website = website,
                        CrawlResult = new CrawlResult(),
                    };

                websiteProcessingDefinition = new WebsiteProcessingDefinition(websiteDefinition)
                    {
                        ProcessingBlock = processingBlock,
                        CompletionSource = new TaskCompletionSource<CrawlResult>()
                    };

                // Only list the definition when the processing map accepted it.
                if (websiteProcessingDefinitions.TryAdd(websiteDefinition, websiteProcessingDefinition))
                {
                    websiteDefinitions.Add(websiteDefinition);
                }
            }

            // Event and URL scheduling run outside the lock, as before.
            RaiseWebsiteScheduled(website);

            var outstandingLinks = Schedule(new[] { website.RootUrl });
            if (outstandingLinks > 0)
                ScheduleNext();

            return websiteProcessingDefinition.CompletionSource.Task;
        }
예제 #3
0
		/// <summary>
		/// Records the outcome of a finished crawl on the website and persists the website.
		/// Any exception raised while doing so is logged and not rethrown.
		/// </summary>
		/// <param name="website">The website whose crawl has finished.</param>
		/// <param name="result">The crawl result supplying the number of pages crawled.</param>
		private void WebsiteCrawlFinished(Website website, CrawlResult result)
		{
			try
			{
				var rootUrl = website.RootUrl;
				Log.InfoFormat("Finished crawl for website {0}", rootUrl);

				// The crawl end time is taken from the visit timestamp already set on the website.
				var visitedAt = website.LastVisit;
				website.LastCrawlEndedAt = visitedAt;

				var pageCount = result.NumberOfPagesCrawled;
				website.PagesCrawled = pageCount;

				crawlPersister.Save(website);
			}
			catch (Exception failure)
			{
				// Best effort: a bookkeeping failure is reported but does not propagate.
				Log.Error(failure);
			}
		}
예제 #4
0
        /// <summary>
        /// Stores the crawl statistics and crawl settings of <paramref name="website"/>.
        /// When a stored website with the same <c>RootUrl</c> is found, the values are copied onto
        /// that document; otherwise <paramref name="website"/> itself is stored.
        /// </summary>
        /// <param name="website">The website carrying the values to persist.</param>
        public void Save(Website website)
        {
            using (var session = documentStore.OpenSession())
            {
                var stored = session.Query<Website>()
                    .Where(w => w.RootUrl == website.RootUrl)
                    .FirstOrDefault();

                // Fall back to the incoming instance when no document matches the root URL.
                var target = stored ?? website;

                // Crawl statistics.
                target.LastVisit = website.LastVisit;
                target.LastCrawlStartedAt = website.LastCrawlStartedAt;
                target.LastCrawlEndedAt = website.LastCrawlEndedAt;
                target.PagesCrawled = website.PagesCrawled;

                // Crawl settings.
                target.MaxConcurrentConnections = website.MaxConcurrentConnections;
                target.IntervalBetweenVisits = website.IntervalBetweenVisits;
                target.FollowExternalLinks = website.FollowExternalLinks;

                session.Store(target);
                session.SaveChanges();
            }
        }
예제 #5
0
 /// <summary>
 /// Invokes the <c>WebsiteScheduled</c> delegate with the given website, if it is set.
 /// </summary>
 /// <param name="website">The website passed to the delegate.</param>
 private void RaiseWebsiteScheduled(Website website)
 {
     // Copy to a local so the null check and the call see the same delegate instance.
     var subscribers = WebsiteScheduled;
     if (subscribers == null)
         return;

     subscribers(website);
 }
예제 #6
0
        /// <summary>
        /// Builds the dataflow pipeline for one website: a crawl stage linked to a persist stage,
        /// which is in turn linked to <c>schedulingBlock</c>.
        /// </summary>
        /// <param name="website">
        /// The website whose <c>MaxConcurrentConnections</c> sets the crawl stage's parallelism;
        /// when it is not positive, the configured per-website default is used.
        /// </param>
        /// <returns>The crawl stage, i.e. the head of the pipeline.</returns>
        private TransformBlock<CrawlUrl, PageCrawlResult> CreateProcessingBlock(Website website)
        {
            int degreeOfParallelism = website.MaxConcurrentConnections > 0
                ? website.MaxConcurrentConnections
                : configuration.MaxConcurrentConnectionsPerWebsite;

            var crawlOptions = new ExecutionDataflowBlockOptions
                {
                    MaxDegreeOfParallelism = degreeOfParallelism,
                };

            // Stage 1: crawl the URL and attach the originating CrawlUrl to the result.
            var crawlStage = new TransformBlock<CrawlUrl, PageCrawlResult>(url =>
                {
                    var crawled = pageCrawler.Crawl(url.Uri);
                    crawled.CrawlUrl = url;

                    return crawled;
                }, crawlOptions);

            // Stage 2: mark the URL as done and decrement the website's in-process counter.
            var persistStage = new TransformBlock<PageCrawlResult, PageCrawlResult>(crawled =>
                {
                    crawlUrlRepository.Done(crawled.CrawlUrl.Hash, crawled.CrawlUrl);

                    Interlocked.Decrement(ref crawled.CrawlUrl.WebsiteDefinition.UrlsInProcess);

                    return crawled;
                });

            crawlStage.LinkTo(persistStage);
            persistStage.LinkTo(schedulingBlock);

            return crawlStage;
        }