/// <summary>
/// Runs a full crawl of <paramref name="target"/>: schedules it, stamps the visit
/// time when the crawl completes, and persists the outcome via WebsiteCrawlFinished.
/// </summary>
/// <param name="target">The website to crawl.</param>
/// <returns>The aggregate result of the crawl.</returns>
public async Task<CrawlResult> RunAsync(Website target)
{
    // NOTE(review): DateTime.Now is local time; consider DateTime.UtcNow if this
    // timestamp is compared across machines — kept as-is for consistency with
    // DateTimeOffset.Now used elsewhere in this class.
    crawlStarted = DateTime.Now;

    // Await directly instead of ContinueWith: exceptions/cancellation are unwrapped
    // and propagated to the caller, and WebsiteCrawlFinished only runs on success
    // (the old continuation ran unconditionally and t.Result could throw a wrapped
    // AggregateException from inside it).
    var result = await crawlScheduler.Schedule(target);

    target.LastVisit = DateTimeOffset.Now;
    WebsiteCrawlFinished(target, result);
    return result;
}
/// <summary>
/// Schedules a website for crawling. Idempotent per website: a website that is
/// already scheduled returns the existing completion task. A null website or a
/// blank root URL yields a canceled task.
/// </summary>
/// <param name="website">The website to schedule; its RootUrl is normalized in place.</param>
/// <returns>A task that completes when the whole crawl of the website finishes.</returns>
public Task<CrawlResult> Schedule(Website website)
{
    // Guard first — the original validated only after already using the argument
    // in the duplicate lookup below.
    if (website == null || string.IsNullOrWhiteSpace(website.RootUrl))
    {
        // RunContinuationsAsynchronously prevents caller continuations from being
        // inlined on the thread that transitions the TCS.
        var cancelledTask = new TaskCompletionSource<CrawlResult>(TaskCreationOptions.RunContinuationsAsynchronously);
        cancelledTask.SetCanceled();
        return cancelledTask.Task;
    }

    // Already scheduled? Hand back the in-flight crawl's completion task.
    var existing = websiteDefinitions.FirstOrDefault(x => x.Website == website);
    if (existing != null)
        return websiteProcessingDefinitions[existing].CompletionSource.Task;

    // Normalize: drop any fragment and trailing slash so equivalent URLs compare equal.
    website.RootUrl = website.RootUrl.Split('#')[0].TrimEnd('/');

    WebsiteProcessingDefinition websiteProcessingDefinition;
    lock (websiteLock)
    {
        var processingBlock = CreateProcessingBlock(website);
        var websiteDefinition = new WebsiteDefinition
        {
            Website = website,
            CrawlResult = new CrawlResult(),
        };
        websiteProcessingDefinition = new WebsiteProcessingDefinition(websiteDefinition)
        {
            ProcessingBlock = processingBlock,
            CompletionSource = new TaskCompletionSource<CrawlResult>(TaskCreationOptions.RunContinuationsAsynchronously)
        };
        // TryAdd guards against a concurrent Schedule racing past the lookup above;
        // only the winner records the definition in the list.
        if (websiteProcessingDefinitions.TryAdd(websiteDefinition, websiteProcessingDefinition))
        {
            websiteDefinitions.Add(websiteDefinition);
        }
    }

    RaiseWebsiteScheduled(website);

    // Seed the crawl with the root URL; kick the scheduler only if something was queued.
    var outstandingLinks = Schedule(new[] { website.RootUrl });
    if (outstandingLinks > 0)
        ScheduleNext();

    return websiteProcessingDefinition.CompletionSource.Task;
}
/// <summary>
/// Records the outcome of a finished crawl on the website entity and persists it.
/// Best-effort: any failure is logged and swallowed so a persistence error cannot
/// fault the crawl pipeline.
/// </summary>
/// <param name="website">The website whose crawl just finished.</param>
/// <param name="result">The aggregate crawl result to record.</param>
private void WebsiteCrawlFinished(Website website, CrawlResult result)
{
    try
    {
        Log.InfoFormat("Finished crawl for website {0}", website.RootUrl);

        // Mirror the visit timestamp as the crawl end time, capture the page
        // count, then save the updated entity.
        website.LastCrawlEndedAt = website.LastVisit;
        website.PagesCrawled = result.NumberOfPagesCrawled;
        crawlPersister.Save(website);
    }
    catch (Exception ex)
    {
        // Deliberate broad catch: finishing bookkeeping must never throw.
        Log.Error(ex);
    }
}
/// <summary>
/// Upserts a website by root URL: copies the crawl/visit statistics and settings
/// onto the stored document when one exists, otherwise stores the incoming
/// instance as a new document.
/// </summary>
/// <param name="website">The website carrying the values to persist.</param>
public void Save(Website website)
{
    using (var session = documentStore.OpenSession())
    {
        // Match on RootUrl; fall back to the incoming instance when no document exists
        // (the field copies below are then harmless self-assignments).
        var stored = session.Query<Website>().FirstOrDefault(w => w.RootUrl == website.RootUrl) ?? website;

        stored.LastVisit = website.LastVisit;
        stored.LastCrawlStartedAt = website.LastCrawlStartedAt;
        stored.LastCrawlEndedAt = website.LastCrawlEndedAt;
        stored.PagesCrawled = website.PagesCrawled;
        stored.MaxConcurrentConnections = website.MaxConcurrentConnections;
        stored.IntervalBetweenVisits = website.IntervalBetweenVisits;
        stored.FollowExternalLinks = website.FollowExternalLinks;

        session.Store(stored);
        session.SaveChanges();
    }
}
/// <summary>
/// Raises the <see cref="WebsiteScheduled"/> event for the given website, if anyone
/// is subscribed.
/// </summary>
/// <param name="website">The website that was just scheduled.</param>
private void RaiseWebsiteScheduled(Website website)
{
    // Null-conditional invoke: reads the delegate once, so it is safe against a
    // concurrent unsubscribe (same guarantee as the old copy-to-local pattern).
    WebsiteScheduled?.Invoke(website);
}
/// <summary>
/// Builds the per-website TPL Dataflow pipeline: a crawl block fetches each URL
/// (bounded by the site's connection limit), a persist block marks the URL done and
/// decrements the site's in-flight counter, and results then flow into the shared
/// schedulingBlock. Returns the head (crawl) block for posting URLs.
/// </summary>
/// <param name="website">The website whose per-site connection limit configures parallelism.</param>
/// <returns>The entry block of the pipeline; post CrawlUrl items to it.</returns>
private TransformBlock<CrawlUrl, PageCrawlResult> CreateProcessingBlock(Website website)
{
    var processingBlock = new TransformBlock<CrawlUrl, PageCrawlResult>(crawlUrl =>
    {
        // Fetch the page and tag the result with the originating CrawlUrl so
        // downstream blocks can update bookkeeping for it.
        var result = pageCrawler.Crawl(crawlUrl.Uri);
        result.CrawlUrl = crawlUrl;
        return result;
    }, new ExecutionDataflowBlockOptions
    {
        // Per-site override wins when positive; otherwise fall back to the global cap.
        MaxDegreeOfParallelism = website.MaxConcurrentConnections > 0 ? website.MaxConcurrentConnections : configuration.MaxConcurrentConnectionsPerWebsite,
    });
    var persistBlock = new TransformBlock<PageCrawlResult, PageCrawlResult>(result =>
    {
        // Record the URL as crawled, then atomically decrement the site's
        // in-flight counter (shared across the parallel crawl block).
        crawlUrlRepository.Done(result.CrawlUrl.Hash, result.CrawlUrl);
        Interlocked.Decrement(ref result.CrawlUrl.WebsiteDefinition.UrlsInProcess);
        return result;
    });
    // NOTE(review): links are made without DataflowLinkOptions { PropagateCompletion = true } —
    // presumably completion is driven elsewhere (e.g. via the CompletionSource); confirm,
    // otherwise Complete() on processingBlock will not flow downstream.
    processingBlock.LinkTo(persistBlock);
    persistBlock.LinkTo(schedulingBlock);
    return processingBlock;
}