private bool ShouldCrawlNextPages()
{
    CrawlQueueItem currentCrawler = null;
    if (_pagesToCrawl.Count > 0)
    {
        currentCrawler = _pagesToCrawl.Peek();
    }

    return !_contentFound
        && _crawledUri.Count + _currentTasks.Count < _configuration.MaxPages
        && (currentCrawler == null || currentCrawler.PageLevel < _configuration.MaxPageLevel);
}
private async Task<Tuple<CrawlPageResults, int>> CrawlPage(CrawlQueueItem crawlItem, CancellationToken cancellationToken)
{
    CrawlPageResults results = null;
    try
    {
        // Save the page into the root of the domain directory.
        using (FileStream pageStream = new FileStream(
            Path.Combine(_domainDirectory, GetPageFileNameByUri(crawlItem.PageUri)),
            FileMode.Create))
        {
            var pageCrawler = new PageCrawler(crawlItem.PageUri);
            results = await pageCrawler.StartCrawling(pageStream, cancellationToken);

            // In an ideal world there would be a pass-through stream here that PageCrawler
            // fed the page into as it read it, so the stop condition would be checked on
            // the fly without an extra pass over the content; but we work with what we have.
            if (!string.IsNullOrEmpty(_configuration.StopString)
                && results.PageContent.Contains(_configuration.StopString))
            {
                Logger.Log(LogLevel.Info, $"Stop string found at {crawlItem.PageUri}");
                _contentFound = true;
            }
        }

        await LoadContent(results);
    }
    catch (Exception e)
    {
        Logger.Log(LogLevel.Error, e, $"Page crawl exception with uri {crawlItem.PageUri}");
    }

    return new Tuple<CrawlPageResults, int>(results, crawlItem.PageLevel);
}
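// A minimal sketch of the "ideal world" mentioned above: a pass-through stream that scans
// bytes for the stop string while PageCrawler writes the page into it, so no second pass
// over PageContent is needed. StopStringDetectingStream is a hypothetical helper, not part
// of the existing code; it assumes a UTF-8 stop string, assumes System.IO and System.Text
// are imported, and does not handle a match that spans two Write calls (a real
// implementation would keep a small rolling buffer for that).
private sealed class StopStringDetectingStream : Stream
{
    private readonly Stream _inner;
    private readonly byte[] _pattern;

    public StopStringDetectingStream(Stream inner, string stopString)
    {
        _inner = inner;
        _pattern = Encoding.UTF8.GetBytes(stopString ?? string.Empty);
    }

    public bool StopStringFound { get; private set; }

    public override void Write(byte[] buffer, int offset, int count)
    {
        // Naive substring scan of the current chunk; enough to illustrate the idea.
        if (!StopStringFound && _pattern.Length > 0)
        {
            for (int i = offset; i <= offset + count - _pattern.Length; i++)
            {
                int j = 0;
                while (j < _pattern.Length && buffer[i + j] == _pattern[j])
                {
                    j++;
                }

                if (j == _pattern.Length)
                {
                    StopStringFound = true;
                    break;
                }
            }
        }

        _inner.Write(buffer, offset, count);
    }

    public override bool CanRead => false;
    public override bool CanSeek => false;
    public override bool CanWrite => true;
    public override long Length => _inner.Length;
    public override long Position
    {
        get => _inner.Position;
        set => throw new NotSupportedException();
    }
    public override void Flush() => _inner.Flush();
    public override int Read(byte[] buffer, int offset, int count) => throw new NotSupportedException();
    public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException();
    public override void SetLength(long value) => throw new NotSupportedException();
}

// Hypothetical usage inside CrawlPage: wrap pageStream in the detector, pass the wrapper to
// pageCrawler.StartCrawling, then read detector.StopStringFound instead of scanning
// results.PageContent afterwards.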
private void EnqeueCrawlers(IEnumerable<Uri> nextPages, int parentLevel)
{
    foreach (var page in nextPages)
    {
        if (!_queuedUri.Contains(page) && !_crawledUri.Contains(page))
        {
            var item = new CrawlQueueItem()
            {
                PageUri = page,
                PageLevel = parentLevel + 1
            };

            _queuedUri.Add(page);
            _pagesToCrawl.Enqueue(item);
        }
    }
}
public async Task<DomainCrawlStatistics> CrawlDomain(CancellationToken cancellationToken)
{
    CancellationToken resultToken = CancellationTokenSource.CreateLinkedTokenSource(
        cancellationToken, _internalCTS.Token).Token;
    _contentFound = false;

    // Unwrap() is required here: StartNew with an async lambda returns Task<Task>, and
    // awaiting only the outer task would let the method return before the crawl finishes.
    await Task.Factory.StartNew(
        async () =>
        {
            InitQueue();
            Directory.CreateDirectory(_domainDirectory);
            RobotsParams robotsParams = await _robotsReader.GetRobotsParams();

            while ((_pagesToCrawl.Count > 0 && ShouldCrawlNextPages()) || _currentTasks.Count > 0)
            {
                foreach (var task in _currentTasks.Where(t => t.IsCompleted).ToArray())
                {
                    // CrawlPage swallows its exceptions, so a completed task may still
                    // carry a null result; skip those instead of dereferencing them.
                    if (!task.IsFaulted && !task.IsCanceled && task.Result.Item1 != null)
                    {
                        var results = task.Result;
                        UpdateProgress(results.Item1);

                        if (results.Item2 < _configuration.MaxPageLevel)
                        {
                            EnqeueCrawlers(results.Item1.References, results.Item2);
                        }

                        _crawledUri.Add(results.Item1.CrawledUri);
                        _queuedUri.Remove(results.Item1.CrawledUri);
                    }

                    _currentTasks.Remove(task);
                }

                if (_currentTasks.Count >= _configuration.MaxTasks)
                {
                    await Task.WhenAny(_currentTasks.ToArray());
                    continue;
                }

                if (_pagesToCrawl.Count > 0 && ShouldCrawlNextPages())
                {
                    CrawlQueueItem currentCrawler = _pagesToCrawl.Dequeue();
                    _currentTasks.Add(CrawlPage(currentCrawler, resultToken));

                    if (robotsParams.CrawlDelay > 0)
                    {
                        await Task.Delay(robotsParams.CrawlDelay, resultToken);
                    }
                }
                else if (_currentTasks.Count > 0)
                {
                    // Nothing left to schedule right now; wait for an in-flight task
                    // instead of spinning in a tight loop.
                    await Task.WhenAny(_currentTasks.ToArray());
                }
            }

            SaveProgress();
        },
        TaskCreationOptions.LongRunning).Unwrap();

    return _statistics;
}