Beispiel #1
0
        /// <summary>
        /// Queues a <see cref="DomainCrawler"/> for every entry in <paramref name="items"/> and runs them,
        /// starting a new crawl only while fewer than <c>_configuration.MaxCrawlDomains</c> are being tracked.
        /// </summary>
        /// <param name="items">Domains to crawl; an item whose configuration is <c>null</c> gets a default one.</param>
        /// <returns>A task that completes once the queue is drained and the crawls still being tracked have finished.</returns>
        /// <exception cref="System.ArgumentNullException"><paramref name="items"/> is <c>null</c>.</exception>
        public async Task StartCrawlingAsync(IEnumerable<WebCrawlerItem> items)
        {
            // Validate before touching any state so a bad call leaves the tracked set untouched.
            if (items == null)
            {
                throw new System.ArgumentNullException(nameof(items));
            }

            _crawlingDomains.Clear();

            foreach (var webCrawlerItem in items)
            {
                // Items without their own configuration fall back to a default-constructed one.
                DomainCrawlerConfiguration config = webCrawlerItem.Configuration ?? new DomainCrawlerConfiguration();
                _domainsToCrawl.Enqueue(new DomainCrawler(webCrawlerItem.Domain, config));
            }

            while (_domainsToCrawl.Count > 0)
            {
                // This throttling logic is turning into a copy-paste of the one in the domain crawler.
                if (_crawlingDomains.Count >= _configuration.MaxCrawlDomains)
                {
                    // Free the slots held by crawls that have already finished.
                    // NOTE(review): tasks dropped here are never awaited by this method, so a crawl
                    // that faulted before removal does not surface its exception — confirm intended.
                    _crawlingDomains.RemoveWhere(d => d.IsCompleted);

                    if (_crawlingDomains.Count >= _configuration.MaxCrawlDomains)
                    {
                        // Still saturated: wait until at least one running crawl completes.
                        // NOTE(review): with a zero or negative MaxCrawlDomains the set can be empty
                        // here, and Task.WhenAny rejects an empty array — confirm the limit is positive.
                        await Task.WhenAny(_crawlingDomains.ToArray());
                    }

                    // Re-evaluate capacity from the top of the loop before starting anything.
                    continue;
                }

                DomainCrawler crawler = _domainsToCrawl.Dequeue();
                _crawlingDomains.Add(crawler.CrawlDomain(_cts.Token));
            }

            // The queue is drained; wait for the crawls that are still being tracked.
            await Task.WhenAll(_crawlingDomains.ToArray());
        }
        /// <summary>
        /// Initializes a new item from a domain URI and the crawler configuration that goes with it.
        /// </summary>
        /// <param name="domain">Value stored in <c>Domain</c>; must not be <c>null</c>.</param>
        /// <param name="configuration">Value stored in <c>Configuration</c>; must not be <c>null</c>.</param>
        /// <exception cref="ArgumentNullException">
        /// <paramref name="domain"/> or <paramref name="configuration"/> is <c>null</c>.
        /// </exception>
        public WebCrawlerItem(Uri domain, DomainCrawlerConfiguration configuration)
        {
            // Both guards use nameof so the reported parameter names survive renames;
            // domain is still validated first, as before.
            Domain        = domain ?? throw new ArgumentNullException(nameof(domain));
            Configuration = configuration ?? throw new ArgumentNullException(nameof(configuration));
        }
        /// <summary>
        /// Initializes a crawler for the site identified by <paramref name="siteUri"/>.
        /// </summary>
        /// <param name="siteUri">Site URI; must not be <c>null</c>.</param>
        /// <param name="configuration">Crawler settings; stored as given, without a null check.</param>
        /// <exception cref="ArgumentNullException"><paramref name="siteUri"/> is <c>null</c>.</exception>
        public DomainCrawler(Uri siteUri, DomainCrawlerConfiguration configuration)
        {
            _siteUri       = siteUri ?? throw new ArgumentNullException(nameof(siteUri));
            _robotsReader  = new RobotsReader(_siteUri);
            _configuration = configuration;

            // The host with every "/" and ":" swapped for "-" becomes the domain directory name.
            string host = _siteUri.Host;
            _domainDirectory = host.Replace("/", "-").Replace(":", "-");
        }