public void Run(CrawlerWorkerArgs args)
{
    _parent = args.Parent;
    _callback = args.Function;
    if (_callback == null)
    {
        throw new ArgumentException("No callback was provided, so no work can be done.");
    }
    _robots = args.Robots;
    _bannedExts = args.BannedExtensions;
    _visited = args.Visited;
    _visitedLock = args.VisitedLock;
    _sites = args.Sites;
    _respectRobots = args.RespectRobots;
    _userAgent = args.UserAgent;
    _maxDepth = args.MaxDepth;
    _id = args.ID;

    _web = new HtmlWeb();
    _web.UserAgent = _userAgent;

    while (true)
    {
        Site next = null;
        if (!_sites.GetNextAvailableWorker(out next))
        {
            // No work available: count this worker as paused. If the queue is empty
            // and every worker is paused, the crawl is finished and the worker exits.
            Interlocked.Increment(ref _parent.PausedWorkers);
            if (_sites.Empty() && _parent.PausedWorkers == _parent.TotalWorkers)
            {
                return;
            }
            Thread.Sleep(500);
            Interlocked.Decrement(ref _parent.PausedWorkers);
            continue;
        }

        string url = next.Url;

        // Check the shared visited set under a read lock before doing any crawling.
        _visitedLock.EnterReadLock();
        bool visited = _visited.Contains(url);
        _visitedLock.ExitReadLock();

        if (next.Depth < _maxDepth && !visited)
        {
            VisitOneSite(next);
        }
    }
}
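For reference, a minimal sketch of what the CrawlerWorkerArgs bundle might look like, inferred purely from the members Run() copies out of it. The property names come from the code above; the concrete types (HashSet, ReaderWriterLockSlim, the Crawler parent, and the queue/robots types) are assumptions, not the actual definitions.

// Sketch only: types are assumed from how Run() uses each property.
public class CrawlerWorkerArgs
{
    public Crawler Parent;                   // owning crawler; exposes PausedWorkers / TotalWorkers counters
    public Action<string> Function;          // per-URL callback; Run() throws if this is null
    public object Robots;                    // robots.txt rules, honoured when RespectRobots is true
    public List<string> BannedExtensions;    // file extensions the worker should skip
    public HashSet<string> Visited;          // shared set of already-visited URLs
    public ReaderWriterLockSlim VisitedLock; // guards Visited
    public object Sites;                     // shared work queue of Site entries
    public bool RespectRobots;
    public string UserAgent;
    public int MaxDepth;
    public int ID;
}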
public void UrlFound(string url)
{
    // Take an upgradeable read lock so the existence check and the insert happen
    // under the same lock; with a separate read-then-write sequence, two workers
    // could both see the URL as new and write it to the output file twice.
    _foundLock.EnterUpgradeableReadLock();
    if (!_found.Contains(url))
    {
        _foundLock.EnterWriteLock();
        _found.Add(url);
        _outFile.WriteLine(url);
        _outFile.Flush();
        _foundLock.ExitWriteLock();
        _log.Info($"!!! URL {url} !!!");
    }
    _foundLock.ExitUpgradeableReadLock();
}
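UrlFound() relies on a few shared members that are not shown above. A minimal sketch of how they might be declared, assuming standard BCL types and a generic logging interface (the real class may use different types or a specific logging library):

// Sketch only: field names match the usage above, types are assumptions.
private readonly HashSet<string> _found = new HashSet<string>();            // URLs already reported
private readonly ReaderWriterLockSlim _foundLock = new ReaderWriterLockSlim(); // guards _found and the output file
private StreamWriter _outFile;  // results file; flushed after every hit so output survives an abrupt stop
private ILog _log;              // logging abstraction (assumed)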