Ejemplo n.º 1
0
        /// <summary>
        /// Scans the text of a robots.txt file for <c>Crawl-delay</c> directives.
        /// </summary>
        /// <remarks>
        /// Directives are matched case-insensitively on every line, regardless of which
        /// user-agent group they appear in. Anything from a <c>#</c> to the end of the line
        /// is treated as a comment and ignored. When several directives are present, the
        /// last one wins.
        /// </remarks>
        /// <param name="robotsString">
        /// Raw robots.txt content. A null or empty value is treated as "no directives".
        /// </param>
        /// <returns>
        /// A new <see cref="RobotsParams"/>; its <c>CrawlDelay</c> is set only when a valid
        /// directive is found, otherwise it keeps the value assigned by the constructor.
        /// </returns>
        public RobotsParams ParseRobots(string robotsString)
        {
            RobotsParams robotParams = new RobotsParams();

            // StringReader rejects null, so bail out early with the defaults.
            if (string.IsNullOrEmpty(robotsString))
            {
                return robotParams;
            }

            // Built once up front instead of once per line.
            Regex delayRegex = new Regex("crawl-delay\\s*:\\s*(\\d+)", RegexOptions.IgnoreCase);

            using (var stringReader = new StringReader(robotsString))
            {
                string line;
                while ((line = stringReader.ReadLine()) != null)
                {
                    // Drop the comment part before matching. Using ">= 0" matters: a '#'
                    // in the very first column (index 0) comments out the whole line.
                    int commentPosition = line.IndexOf('#');
                    string directive = commentPosition >= 0
                        ? line.Substring(0, commentPosition)
                        : line;

                    Match match = delayRegex.Match(directive);

                    if (!match.Success)
                    {
                        continue;
                    }

                    // TryParse guards against digit runs that overflow an int.
                    int delay;
                    if (int.TryParse(match.Groups[1].Value, out delay))
                    {
                        robotParams.CrawlDelay = delay;
                    }
                }
            }

            return robotParams;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Crawls the domain: initialises the queue, creates the domain directory, reads the
        /// robots parameters and then keeps starting page crawls until the queue is drained
        /// (or <c>ShouldCrawlNextPages</c> says stop) and every started crawl has finished.
        /// Progress is saved once the loop ends.
        /// </summary>
        /// <param name="cancellationToken">
        /// Caller's cancellation token. It is linked with the instance's internal token and
        /// the combined token is passed to each page crawl and to the crawl-delay wait.
        /// </param>
        /// <returns>The statistics object held by this instance, after the crawl has completed.</returns>
        /// <exception cref="OperationCanceledException">
        /// The combined token was cancelled while waiting out the crawl delay.
        /// </exception>
        public async Task<DomainCrawlStatistics> CrawlDomain(CancellationToken cancellationToken)
        {
            // The linked source holds registrations on both parent tokens; dispose it when done.
            using (CancellationTokenSource linkedSource =
                CancellationTokenSource.CreateLinkedTokenSource(
                    cancellationToken,
                    _internalCTS.Token))
            {
                CancellationToken resultToken = linkedSource.Token;

                _contentFound = false;

                // StartNew with an async lambda produces a Task<Task>. Without Unwrap() only the
                // outer task is awaited, which completes at the lambda's first await, so the
                // method would return before the crawl is done and lose any exception it throws.
                await Task.Factory.StartNew(
                    async () =>
                    {
                        InitQueue();

                        Directory.CreateDirectory(_domainDirectory);

                        RobotsParams robotsParams = await _robotsReader.GetRobotsParams();

                        while ((_pagesToCrawl.Count > 0 && ShouldCrawlNextPages()) || _currentTasks.Count > 0)
                        {
                            // Harvest finished crawls. Snapshot with ToArray because the
                            // collection is modified inside the loop.
                            foreach (var task in _currentTasks.Where(t => t.IsCompleted).ToArray())
                            {
                                // Faulted or cancelled page crawls are dropped without a result.
                                if (!task.IsFaulted && !task.IsCanceled)
                                {
                                    // The task is already complete, so Result does not block.
                                    var results = task.Result;

                                    UpdateProgress(results.Item1);

                                    if (results.Item2 < _configuration.MaxPageLevel)
                                    {
                                        EnqeueCrawlers(results.Item1.References, results.Item2);
                                    }

                                    _crawledUri.Add(results.Item1.CrawledUri);
                                    _queuedUri.Remove(results.Item1.CrawledUri);
                                }

                                _currentTasks.Remove(task);
                            }

                            // At capacity: wait for a slot to free up.
                            if (_currentTasks.Count >= _configuration.MaxTasks)
                            {
                                await Task.WhenAny(_currentTasks.ToArray());
                                continue;
                            }

                            if (_pagesToCrawl.Count > 0 &&
                                ShouldCrawlNextPages())
                            {
                                CrawlQueueItem currentCrawler = _pagesToCrawl.Dequeue();

                                _currentTasks.Add(CrawlPage(currentCrawler, resultToken));

                                if (robotsParams.CrawlDelay > 0)
                                {
                                    // Task.Delay(int) takes milliseconds, while robots.txt
                                    // Crawl-delay is expressed in seconds.
                                    // NOTE(review): assumes CrawlDelay holds the raw robots.txt
                                    // value in seconds, as ParseRobots stores it - confirm.
                                    await Task.Delay(
                                        TimeSpan.FromSeconds(robotsParams.CrawlDelay),
                                        resultToken);
                                }
                            }
                            else if (_currentTasks.Count > 0)
                            {
                                // Nothing can be started right now but crawls are still in
                                // flight: wait for one to finish instead of spinning the loop.
                                await Task.WhenAny(_currentTasks.ToArray());
                            }
                        }

                        SaveProgress();
                    },
                    TaskCreationOptions.LongRunning).Unwrap();
            }

            return _statistics;
        }