/// <summary>
/// Queues the supplied domains and crawls them, keeping at most
/// <c>_configuration.MaxCrawlDomains</c> domain crawls in flight at once.
/// </summary>
/// <param name="items">Domains to crawl, each with an optional per-domain configuration.</param>
/// <exception cref="ArgumentNullException"><paramref name="items"/> is null.</exception>
public async Task StartCrawlingAsync(IEnumerable<WebCrawlerItem> items)
{
    if (items == null)
    {
        throw new ArgumentNullException(nameof(items));
    }

    _crawlingDomains.Clear();

    foreach (var webCrawlerItem in items)
    {
        DomainCrawlerConfiguration config = webCrawlerItem.Configuration;
        _domainsToCrawl.Enqueue(new DomainCrawler(webCrawlerItem.Domain, config ?? new DomainCrawlerConfiguration()));
    }

    while (_domainsToCrawl.Count > 0)
    {
        // NOTE(review): this throttling loop looks like a copy of logic inside the
        // domain crawler — consider extracting a shared "bounded concurrency" helper.
        if (_crawlingDomains.Count >= _configuration.MaxCrawlDomains)
        {
            // Drop finished crawls; if we are still at the limit, wait for at
            // least one in-flight crawl to complete before trying again.
            _crawlingDomains.RemoveWhere(d => d.IsCompleted);
            if (_crawlingDomains.Count >= _configuration.MaxCrawlDomains)
            {
                await Task.WhenAny(_crawlingDomains.ToArray());
            }

            continue;
        }

        DomainCrawler crawler = _domainsToCrawl.Dequeue();
        _crawlingDomains.Add(crawler.CrawlDomain(_cts.Token));
    }

    // Wait for every crawl that is still running.
    await Task.WhenAll(_crawlingDomains.ToArray());
}
/// <summary>
/// Creates a crawl work item: a domain root and the configuration to crawl it with.
/// </summary>
/// <param name="domain">Root URI of the domain to crawl; must not be null.</param>
/// <param name="configuration">Crawl settings for the domain; must not be null.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public WebCrawlerItem(Uri domain, DomainCrawlerConfiguration configuration)
{
    // Use nameof (not a "domain" string literal) and one consistent guard style.
    Domain = domain ?? throw new ArgumentNullException(nameof(domain));
    Configuration = configuration ?? throw new ArgumentNullException(nameof(configuration));
}
/// <summary>
/// Creates a crawler for the single domain rooted at <paramref name="siteUri"/>.
/// </summary>
/// <param name="siteUri">Root URI of the site to crawl; must not be null.</param>
/// <param name="configuration">Crawl settings for this domain; must not be null.</param>
/// <exception cref="ArgumentNullException">Either argument is null.</exception>
public DomainCrawler(Uri siteUri, DomainCrawlerConfiguration configuration)
{
    // nameof instead of magic strings; guard configuration too, matching the
    // contract enforced by WebCrawlerItem.
    _siteUri = siteUri ?? throw new ArgumentNullException(nameof(siteUri));
    _configuration = configuration ?? throw new ArgumentNullException(nameof(configuration));
    _robotsReader = new RobotsReader(_siteUri);

    // Sanitize the host for use as a directory name ('/' and ':' are illegal
    // in file names); a StringBuilder is unnecessary for two replacements.
    _domainDirectory = _siteUri.Host.Replace("/", "-").Replace(":", "-");
}