/// <summary>
/// Fetches a single site: consults robots.txt (when enabled), records the URL as
/// visited, loads the page, reports found URLs to the parent, and enqueues any
/// discovered links at depth + 1.
/// </summary>
/// <param name="next">The site to visit; its <c>Url</c> must be an absolute URI.</param>
private void VisitOneSite(Site next)
{
    Uri uri = new Uri(next.Url);
    string host = uri.Host;

    if (_respectRobots)
    {
        Robots config;
        if (!_robots.TryGetValue(host, out config))
        {
            // TODO: actually get the robots.txt
            config = new Robots(host, _userAgent);
            // NOTE(review): if another worker added an entry between TryGetValue and
            // TryAdd, this add fails and we keep using our local config for this visit;
            // consider GetOrAdd if _robots is a ConcurrentDictionary — TODO confirm type.
            _robots.TryAdd(host, config);
        }
        if (!config.Allowed(uri))
        {
            return;
        }
    }

    // Mark visited before fetching. Release the write lock in finally so an
    // exception from Add cannot leave the lock held and deadlock other workers.
    _visitedLock.EnterWriteLock();
    try
    {
        _visited.Add(next.Url);
    }
    finally
    {
        _visitedLock.ExitWriteLock();
    }

    try
    {
        HtmlDocument doc = _web.Load(uri);
        List<string> found;
        List<Uri> nextSites;
        _callback(doc, uri, out found, out nextSites);

        foreach (string f in found)
        {
            _parent.UrlFound(f);
        }
        foreach (Uri link in nextSites)
        {
            Site temp = new Site() { Url = link.AbsoluteUri, Depth = next.Depth + 1 };
            _sites.Enqueue(temp);
        }
    }
    catch (Exception e)
    {
        // Log the full URI (AbsolutePath would drop scheme and host, which is
        // useless when crawling multiple hosts).
        _log.Debug($"Worker {_id}: Error visiting site {uri.AbsoluteUri}, exception message {e.Message}.");
    }
}
/// <summary>
/// Builds a crawler from the supplied settings: records the worker count,
/// constructs the crawl queue, opens (and truncates) the output file, and
/// seeds the queue with the configured start URLs at depth 0.
/// </summary>
/// <param name="settings">Configuration for this crawl run.</param>
public Crawler(CrawlerSettings settings)
{
    _settings = settings;
    TotalWorkers = _settings.WorkerCount;
    _sites = new CrawlerQueue(_settings.RespectRobots, _settings.UserAgent);

    // FileMode.Create truncates any existing output file.
    var outputStream = new FileStream(_settings.OutputPath, FileMode.Create);
    _outFile = new StreamWriter(outputStream);

    // Every seed URL enters the queue at depth 0.
    foreach (string seedUrl in _settings.Seeds)
    {
        _sites.Enqueue(new Site() { Url = seedUrl, Depth = 0 });
    }
}