public void Run(CrawlerWorkerArgs args)
{
    _parent = args.Parent;
    _callback = args.Function;
    if (_callback == null)
    {
        throw new ArgumentException("No callback was provided, so no work can be done.");
    }

    _robots = args.Robots;
    _bannedExts = args.BannedExtensions;
    _visited = args.Visited;
    _visitedLock = args.VisitedLock;
    _sites = args.Sites;
    _respectRobots = args.RespectRobots;
    _userAgent = args.UserAgent;
    _maxDepth = args.MaxDepth;
    _id = args.ID;

    _web = new HtmlWeb();
    _web.UserAgent = _userAgent;

    while (true)
    {
        Site next = null;
        if (!_sites.GetNextAvailableWorker(out next))
        {
            // Nothing to do right now: mark this worker as paused. If the queue is
            // empty and every worker is paused, the crawl is finished.
            Interlocked.Increment(ref _parent.PausedWorkers);
            if (_sites.Empty() && _parent.PausedWorkers == _parent.TotalWorkers)
            {
                return;
            }
            Thread.Sleep(500);
            Interlocked.Decrement(ref _parent.PausedWorkers);
            continue;
        }

        string url = next.Url;

        // Check the shared visited set under a read lock before doing any work.
        _visitedLock.EnterReadLock();
        bool visited = _visited.Contains(url);
        _visitedLock.ExitReadLock();

        if (next.Depth < _maxDepth && !visited)
        {
            VisitOneSite(next);
        }
    }
}
public Crawler(CrawlerSettings settings)
{
    _settings = settings;
    TotalWorkers = _settings.WorkerCount;
    _sites = new CrawlerQueue(_settings.RespectRobots, _settings.UserAgent);

    // The output file is created (or truncated) fresh for each crawl.
    _outFile = new StreamWriter(new FileStream(_settings.OutputPath, FileMode.Create));

    // Seed the work queue with the starting URLs at depth 0.
    foreach (string str in _settings.Seeds)
    {
        Site s = new Site() { Url = str, Depth = 0 };
        _sites.Enqueue(s);
    }
}
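For illustration, here is a minimal sketch of how the two pieces above might be wired together: the constructor builds the shared queue and output file, and a Start method packages the shared state into a CrawlerWorkerArgs per worker and spins up one thread each. Start itself, CrawlerWorker, _robots, _visited, _visitedLock, the MaxDepth and BannedExtensions settings, and the callback signature are assumptions for this sketch; they are not shown in the code above.

// Hypothetical sketch only: names marked "assumed" are not confirmed by the source.
public void Start(Action<Site> perPageCallback)
{
    var threads = new List<Thread>();
    for (int i = 0; i < TotalWorkers; i++)
    {
        var args = new CrawlerWorkerArgs
        {
            Parent = this,
            Function = perPageCallback,              // Run() throws if this is null
            Robots = _robots,                        // assumed shared robots.txt cache
            BannedExtensions = _settings.BannedExtensions, // assumed setting
            Visited = _visited,                      // assumed shared HashSet<string>
            VisitedLock = _visitedLock,              // assumed ReaderWriterLockSlim
            Sites = _sites,
            RespectRobots = _settings.RespectRobots,
            UserAgent = _settings.UserAgent,
            MaxDepth = _settings.MaxDepth,           // assumed setting
            ID = i
        };

        var worker = new CrawlerWorker();
        var t = new Thread(() => worker.Run(args));
        t.Start();
        threads.Add(t);
    }

    // Each worker returns once the queue is drained and every worker is paused.
    foreach (var t in threads)
    {
        t.Join();
    }
    _outFile.Close();
}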