/// <summary>
/// Extracts crawl parameters from the text of a robots.txt file.
/// Only the <c>crawl-delay</c> directive is recognised; when several
/// parsable directives are present, the last one wins.
/// </summary>
/// <param name="robotsString">
/// Raw robots.txt content. A null or empty string yields a freshly
/// constructed <see cref="RobotsParams"/> with no values assigned.
/// </param>
/// <returns>
/// A <see cref="RobotsParams"/> whose <c>CrawlDelay</c> holds the integer
/// taken from the last non-commented <c>crawl-delay: N</c> directive, or is
/// left as constructed when no such directive parses.
/// </returns>
public RobotsParams ParseRobots(string robotsString)
{
    RobotsParams robotParams = new RobotsParams();
    if (string.IsNullOrEmpty(robotsString))
    {
        return robotParams;
    }

    // Built once for the whole input instead of once per line.
    Regex delayRegex = new Regex("crawl-delay\\s*:\\s*(\\d+)", RegexOptions.IgnoreCase);

    using (var stringReader = new StringReader(robotsString))
    {
        string line;
        while ((line = stringReader.ReadLine()) != null)
        {
            // Everything from '#' to the end of the line is a comment. Cutting it
            // off before matching also covers a '#' in column 0, which the former
            // "commentPosition > 0" test let through.
            int commentPosition = line.IndexOf('#');
            string directive = commentPosition >= 0
                ? line.Substring(0, commentPosition)
                : line;

            Match match = delayRegex.Match(directive);
            if (!match.Success)
            {
                continue;
            }

            // TryParse rejects values that do not fit in an int; such lines are ignored.
            int delay;
            if (int.TryParse(match.Groups[1].Value, out delay))
            {
                robotParams.CrawlDelay = delay;
            }
        }
    }

    return robotParams;
}
/// <summary>
/// Crawls the domain: drains the page queue with a bounded number of
/// concurrent page-crawl tasks, then saves progress and returns the
/// accumulated statistics.
/// </summary>
/// <param name="cancellationToken">
/// Caller-supplied token. It is linked with the internal cancellation source,
/// and the combined token is passed to every page crawl and inter-request delay.
/// </param>
/// <returns>The statistics object once the crawl loop has finished.</returns>
/// <remarks>
/// The returned task completes only after the whole crawl loop has run.
/// Exceptions thrown by the loop, including the cancellation exception raised
/// by <c>Task.Delay</c>, propagate to the awaiting caller.
/// </remarks>
public async Task<DomainCrawlStatistics> CrawlDomain(CancellationToken cancellationToken)
{
    // The linked source holds registrations on both parent tokens; dispose it
    // when the crawl is over so those registrations are released.
    using (CancellationTokenSource linkedSource = CancellationTokenSource.CreateLinkedTokenSource(
               cancellationToken, _internalCTS.Token))
    {
        CancellationToken resultToken = linkedSource.Token;
        _contentFound = false;

        // StartNew with an async lambda produces Task<Task>. Unwrap() makes the
        // await cover the lambda's full execution rather than only the part
        // before its first await.
        await Task.Factory.StartNew(
            async () =>
            {
                InitQueue();
                Directory.CreateDirectory(_domainDirectory);
                RobotsParams robotsParams = await _robotsReader.GetRobotsParams();

                while ((_pagesToCrawl.Count > 0 && ShouldCrawlNextPages()) || _currentTasks.Count > 0)
                {
                    // Harvest finished tasks. ToArray() snapshots the sequence so
                    // _currentTasks can be modified while iterating.
                    foreach (var task in _currentTasks.Where(t => t.IsCompleted).ToArray())
                    {
                        if (!task.IsFaulted && !task.IsCanceled)
                        {
                            var results = task.Result;
                            UpdateProgress(results.Item1);

                            // Item2 is compared against MaxPageLevel; presumably it is
                            // the depth of the crawled page — confirm against CrawlPage.
                            if (results.Item2 < _configuration.MaxPageLevel)
                            {
                                EnqeueCrawlers(results.Item1.References, results.Item2);
                            }

                            _crawledUri.Add(results.Item1.CrawledUri);
                            _queuedUri.Remove(results.Item1.CrawledUri);
                        }

                        _currentTasks.Remove(task);
                    }

                    // At the concurrency limit: wait for any task to finish first.
                    if (_currentTasks.Count >= _configuration.MaxTasks)
                    {
                        await Task.WhenAny(_currentTasks.ToArray());
                        continue;
                    }

                    if (_pagesToCrawl.Count > 0 && ShouldCrawlNextPages())
                    {
                        CrawlQueueItem currentCrawler = _pagesToCrawl.Dequeue();
                        _currentTasks.Add(CrawlPage(currentCrawler, resultToken));

                        if (robotsParams.CrawlDelay > 0)
                        {
                            // NOTE(review): Task.Delay(int, ...) takes milliseconds, while the
                            // robots.txt crawl-delay value is conventionally in seconds —
                            // confirm the unit of RobotsParams.CrawlDelay.
                            await Task.Delay(robotsParams.CrawlDelay, resultToken);
                        }
                    }
                    else if (_currentTasks.Count > 0)
                    {
                        // Nothing can be started now but work is still in flight: wait
                        // for a task to complete instead of spinning through the loop.
                        await Task.WhenAny(_currentTasks.ToArray());
                    }
                }

                SaveProgress();
            },
            TaskCreationOptions.LongRunning).Unwrap();
    }

    return _statistics;
}