/// <summary>
/// Runs this worker's crawl loop on the calling thread. The configuration and shared
/// collections are copied out of <paramref name="args"/> into the worker's fields, then
/// sites are pulled from the shared site collection and visited one at a time until
/// the collection is empty and every worker is counted as paused.
/// </summary>
/// <param name="args">Settings and shared state supplied by the parent crawler.</param>
/// <exception cref="ArgumentNullException">Thrown when <paramref name="args"/> is null.</exception>
/// <exception cref="ArgumentException">Thrown when <c>args.Function</c> is null.</exception>
public void Run(CrawlerWorkerArgs args) {
    if (args == null) {
        throw new ArgumentNullException(nameof(args));
    }

    // Time an idle worker sleeps before asking the site collection for work again.
    const int IdlePollMilliseconds = 500;

    _parent = args.Parent;
    _callback = args.Function;
    if (_callback == null) {
        throw new ArgumentException("No callback was provided so no work would be done.");
    }

    _robots = args.Robots;
    _bannedExts = args.BannedExtensions;
    _visited = args.Visited;
    _visitedLock = args.VisitedLock;
    _sites = args.Sites;
    _respectRobots = args.RespectRobots;
    _userAgent = args.UserAgent;
    _maxDepth = args.MaxDepth;
    _id = args.ID;

    _web = new HtmlWeb();
    _web.UserAgent = _userAgent;

    while (true) {
        Site next = null;
        if (!_sites.GetNextAvailableWorker(out next)) {
            // Nothing to hand out right now: count this worker as paused.
            Interlocked.Increment(ref _parent.PausedWorkers);
            if (_sites.Empty() && _parent.PausedWorkers == _parent.TotalWorkers) {
                // The counter is deliberately not decremented on this exit path.
                // NOTE(review): presumably so the remaining workers can also observe
                // PausedWorkers == TotalWorkers and exit - confirm against the parent.
                return;
            }

            Thread.Sleep(IdlePollMilliseconds);
            Interlocked.Decrement(ref _parent.PausedWorkers);
            continue;
        }

        String url = next.Url;

        // Release the read lock even if Contains throws; otherwise the lock stays
        // held by this thread and later writers would block.
        bool visited;
        _visitedLock.EnterReadLock();
        try {
            visited = _visited.Contains(url);
        }
        finally {
            _visitedLock.ExitReadLock();
        }

        // Sites at or beyond the depth limit, or already visited, are dropped.
        if (next.Depth < _maxDepth && !visited) {
            VisitOneSite(next);
        }
    }
}
/// <summary>
/// Thread entry point for a crawler worker. Unpacks the argument/worker pair passed
/// to the thread and calls <c>Run</c> on the worker with those arguments.
/// </summary>
/// <param name="obj">
/// Expected to be a <see cref="Tuple{T1,T2}"/> holding the worker arguments and the worker.
/// </param>
/// <exception cref="InvalidOperationException">
/// Thrown (after logging a fatal message) when <paramref name="obj"/> is null or is not
/// the expected tuple type.
/// </exception>
private void RunWorker(object obj) {
    // The `as` cast yields null for a null argument as well as for a wrong-typed one,
    // so both cases take the logged failure path below instead of a null argument
    // crashing with NullReferenceException on obj.GetType().
    var tuple = obj as Tuple<CrawlerWorkerArgs, CrawlerWorker>;
    if (tuple == null) {
        String error = "Incorrect arguments provided to RunWorker, can't run crawler.";
        _log.Fatal(error);
        throw new InvalidOperationException(error);
    }

    CrawlerWorkerArgs args = tuple.Item1;
    CrawlerWorker worker = tuple.Item2;
    worker.Run(args);
}
/// <summary>
/// Starts one thread per configured worker, each running <c>RunWorker</c> with its own
/// <c>CrawlerWorker</c> and argument object, then blocks until every thread has finished.
/// All workers receive the same robots, visited, visited-lock and sites references.
/// </summary>
public void Crawl() {
    var startedThreads = new List<Thread>();

    int workerIndex = 0;
    while (workerIndex < _settings.WorkerCount) {
        var workerThread = new Thread(new ParameterizedThreadStart(RunWorker));

        // Per-worker arguments: shared state plus this worker's numeric ID.
        var workerArgs = new CrawlerWorkerArgs() {
            Parent = this,
            Function = _settings.Function,
            Robots = _robots,
            Visited = _visited,
            VisitedLock = _visitedLock,
            Sites = _sites,
            RespectRobots = _settings.RespectRobots,
            UserAgent = _settings.UserAgent,
            MaxDepth = _settings.MaxDepth,
            BannedExtensions = _settings.BannedExtensions,
            ID = workerIndex,
        };

        // RunWorker expects exactly this tuple shape as its thread argument.
        var payload = new Tuple<CrawlerWorkerArgs, CrawlerWorker>(workerArgs, new CrawlerWorker());

        _log.Debug($"Starting worker {workerIndex}");
        workerThread.Start(payload);
        startedThreads.Add(workerThread);

        workerIndex++;
    }

    // Wait for the workers in the order they were started.
    for (int joined = 0; joined < startedThreads.Count; joined++) {
        startedThreads[joined].Join();
    }
}