Example #1
        private bool ShouldCrawlNextPages()
        {
            CrawlQueueItem currentCrawler = null;

            if (_pagesToCrawl.Count > 0)
            {
                currentCrawler = _pagesToCrawl.Peek();
            }

            // Keep crawling while the stop string has not been found, the page
            // budget (crawled + in-flight) is not exhausted, and the next queued
            // page is still within the configured depth limit.
            return !_contentFound &&
                   _crawledUri.Count + _currentTasks.Count < _configuration.MaxPages &&
                   (currentCrawler == null || currentCrawler.PageLevel < _configuration.MaxPageLevel);
        }
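
This predicate reads several fields of the surrounding crawler class that the listing does not show. A minimal sketch of the assumed state, with names taken from the examples and types inferred from how they are used in Examples 1 through 4 (the host class name DomainCrawler is also an assumption):

using System;
using System.Collections.Generic;
using System.Threading;
using System.Threading.Tasks;

public partial class DomainCrawler
{
    // Inferred declarations; the real class may differ.
    private readonly Queue<CrawlQueueItem> _pagesToCrawl = new Queue<CrawlQueueItem>();
    private readonly HashSet<Uri> _queuedUri = new HashSet<Uri>();   // queued, not yet crawled
    private readonly HashSet<Uri> _crawledUri = new HashSet<Uri>();  // finished pages
    private readonly List<Task<Tuple<CrawlPageResults, int>>> _currentTasks =
        new List<Task<Tuple<CrawlPageResults, int>>>();
    private readonly CancellationTokenSource _internalCTS = new CancellationTokenSource();
    private string _domainDirectory;
    private CrawlConfiguration _configuration;   // MaxPages, MaxPageLevel, MaxTasks, StopString
    private DomainCrawlStatistics _statistics;
    private bool _contentFound;                  // set when the stop string is found
}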
Example #2
        private async Task<Tuple<CrawlPageResults, int>> CrawlPage(CrawlQueueItem crawlItem, CancellationToken cancellationToken)
        {
            CrawlPageResults results = null;

            try
            {
                // Write the page into the root of the domain directory.
                using (FileStream pageStream = new FileStream(
                           Path.Combine(
                               _domainDirectory,
                               GetPageFileNameByUri(crawlItem.PageUri)),
                           FileMode.Create))
                {
                    var pageCrawler = new PageCrawler(crawlItem.PageUri);

                    results =
                        await pageCrawler.StartCrawling(pageStream, cancellationToken);

                    // In an ideal world there would be a stream here that the page
                    // passes through as PageCrawler reads it, so the stop condition
                    // would be checked on the fly without extra passes over the
                    // content; but we have what we have.
                    if (!string.IsNullOrEmpty(_configuration.StopString) &&
                        results.PageContent.Contains(_configuration.StopString))
                    {
                        Logger.Log(
                            LogLevel.Info,
                            $"Stop string found at {crawlItem.PageUri}");

                        _contentFound = true;
                    }
                }

                await LoadContent(results);
            }
            catch (Exception e)
            {
                Logger.Log(
                    LogLevel.Error,
                    e,
                    $"Page crawl exception with uri {crawlItem.PageUri}");
            }

            return new Tuple<CrawlPageResults, int>(results, crawlItem.PageLevel);
        }
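
The streaming check that the comment inside CrawlPage wishes for can be sketched as a decorating Stream. The class below is hypothetical (it is not part of the original crawler): it forwards every write to the underlying stream while scanning the bytes for a UTF-8 encoded marker, keeping a small tail buffer so a marker split across two Write calls is still detected.

using System;
using System.IO;
using System.Text;

public sealed class StopStringScanStream : Stream
{
    private readonly Stream _inner;
    private readonly byte[] _marker;
    private byte[] _tail = Array.Empty<byte>();

    public bool Found { get; private set; }

    public StopStringScanStream(Stream inner, string stopString)
    {
        _inner = inner;
        _marker = Encoding.UTF8.GetBytes(stopString);
    }

    public override void Write(byte[] buffer, int offset, int count)
    {
        if (!Found && _marker.Length > 0)
        {
            // Search the tail of the previous chunk plus the new one as a
            // single window, so boundary-spanning matches are not missed.
            byte[] window = new byte[_tail.Length + count];
            Buffer.BlockCopy(_tail, 0, window, 0, _tail.Length);
            Buffer.BlockCopy(buffer, offset, window, _tail.Length, count);

            if (IndexOf(window, _marker) >= 0)
            {
                Found = true;
            }

            // Keep the last (marker length - 1) bytes for the next call.
            int keep = Math.Min(window.Length, _marker.Length - 1);
            _tail = new byte[keep];
            Buffer.BlockCopy(window, window.Length - keep, _tail, 0, keep);
        }

        _inner.Write(buffer, offset, count);
    }

    private static int IndexOf(byte[] haystack, byte[] needle)
    {
        for (int i = 0; i + needle.Length <= haystack.Length; i++)
        {
            int j = 0;
            while (j < needle.Length && haystack[i + j] == needle[j]) j++;
            if (j == needle.Length) return i;
        }
        return -1;
    }

    public override bool CanRead => false;
    public override bool CanSeek => false;
    public override bool CanWrite => true;
    public override long Length => _inner.Length;
    public override long Position
    {
        get => _inner.Position;
        set => throw new NotSupportedException();
    }

    public override void Flush() => _inner.Flush();
    public override int Read(byte[] buffer, int offset, int count) => throw new NotSupportedException();
    public override long Seek(long offset, SeekOrigin origin) => throw new NotSupportedException();
    public override void SetLength(long value) => throw new NotSupportedException();
}

CrawlPage could wrap pageStream in such a decorator before handing it to PageCrawler and check Found once StartCrawling returns (or poll it during the download to abort early), instead of searching the fully buffered PageContent afterwards.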
Example #3
        private void EnqeueCrawlers(IEnumerable<Uri> nextPages, int parentLevel)
        {
            foreach (var page in nextPages)
            {
                // Skip pages that are already queued or have been crawled.
                if (!_queuedUri.Contains(page) && !_crawledUri.Contains(page))
                {
                    var item = new CrawlQueueItem()
                    {
                        PageUri   = page,
                        PageLevel = parentLevel + 1
                    };

                    _queuedUri.Add(page);
                    _pagesToCrawl.Enqueue(item);
                }
            }
        }
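
For reference, only two members of CrawlQueueItem appear in this listing; its assumed shape, inferred from the object initializer above (the real class may carry more members):

using System;

public class CrawlQueueItem
{
    public Uri PageUri { get; set; }    // page to download
    public int PageLevel { get; set; }  // depth relative to the start page
}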
Example #4
        public async Task<DomainCrawlStatistics> CrawlDomain(CancellationToken cancellationToken)
        {
            // Honor both the caller's token and the crawler's internal one.
            CancellationToken resultToken =
                CancellationTokenSource.CreateLinkedTokenSource(
                    cancellationToken,
                    _internalCTS.Token).Token;

            _contentFound = false;

            // Unwrap() is required below: with an async delegate, StartNew returns
            // a Task<Task>, and awaiting that directly would resume as soon as the
            // delegate hit its first await rather than when crawling completes.
            await Task.Factory.StartNew(
                async () =>
            {
                InitQueue();

                Directory.CreateDirectory(_domainDirectory);

                RobotsParams robotsParams = await _robotsReader.GetRobotsParams();

                while ((_pagesToCrawl.Count > 0 && ShouldCrawlNextPages()) || _currentTasks.Count > 0)
                {
                    foreach (var task in _currentTasks.Where(t => t.IsCompleted).ToArray())
                    {
                        // CrawlPage swallows its own exceptions and may hand back
                        // a null results object, so guard against that here too.
                        if (!task.IsFaulted && !task.IsCanceled && task.Result.Item1 != null)
                        {
                            var results = task.Result;

                            UpdateProgress(results.Item1);

                            if (results.Item2 < _configuration.MaxPageLevel)
                            {
                                EnqeueCrawlers(results.Item1.References, results.Item2);
                            }

                            _crawledUri.Add(results.Item1.CrawledUri);
                            _queuedUri.Remove(results.Item1.CrawledUri);
                        }
                        }

                        _currentTasks.Remove(task);
                    }

                    if (_currentTasks.Count >= _configuration.MaxTasks)
                    {
                        await Task.WhenAny(_currentTasks.ToArray());
                        continue;
                    }

                    if (_pagesToCrawl.Count > 0 &&
                        ShouldCrawlNextPages())
                    {
                        CrawlQueueItem currentCrawler = _pagesToCrawl.Dequeue();

                        _currentTasks.Add(CrawlPage(currentCrawler, resultToken));

                        // Respect the crawl delay advertised in robots.txt.
                        if (robotsParams.CrawlDelay > 0)
                        {
                            await Task.Delay(robotsParams.CrawlDelay, resultToken);
                        }
                    }
                }

                SaveProgress();
            },
                TaskCreationOptions.LongRunning).Unwrap();

            return _statistics;
        }
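
A hypothetical call site for CrawlDomain; only the method signature comes from the listing, while the host class name DomainCrawler, its construction, and the timeout value are assumptions:

using System;
using System.Threading;
using System.Threading.Tasks;

public static class Program
{
    public static async Task Main()
    {
        var crawler = new DomainCrawler(/* configuration, domain directory, ... */);

        // Abort the whole crawl if it has not finished within five minutes;
        // the linked token inside CrawlDomain picks the cancellation up.
        using (var cts = new CancellationTokenSource(TimeSpan.FromMinutes(5)))
        {
            DomainCrawlStatistics statistics = await crawler.CrawlDomain(cts.Token);
            // Inspect `statistics` here; its members are not part of this listing.
        }
    }
}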