Esempio n. 1
0
        /// <summary>
        /// Copies the crawl configuration from <paramref name="args"/> into this worker, then loops:
        /// take the next site from the shared site collection and pass it to <c>VisitOneSite</c>
        /// unless its URL is already in the visited set or its depth has reached the maximum depth.
        /// </summary>
        /// <remarks>
        /// When no site is available the worker counts itself as paused on the parent. It returns
        /// once the site collection reports empty and the parent's paused count equals its total
        /// worker count; otherwise it sleeps, removes itself from the paused count and retries.
        /// </remarks>
        /// <param name="args">Configuration and shared state for this worker.</param>
        /// <exception cref="ArgumentNullException"><paramref name="args"/> is null.</exception>
        /// <exception cref="ArgumentException"><c>args.Function</c> is null.</exception>
        public void Run(CrawlerWorkerArgs args)
        {
            // How long an idle worker waits before polling the site collection again.
            const int idleSleepMilliseconds = 500;

            if (args == null)
            {
                throw new ArgumentNullException(nameof(args));
            }

            _parent   = args.Parent;
            _callback = args.Function;
            if (_callback == null)
            {
                throw new ArgumentException("No callback was provided so no work would be done.");
            }

            _robots        = args.Robots;
            _bannedExts    = args.BannedExtensions;
            _visited       = args.Visited;
            _visitedLock   = args.VisitedLock;
            _sites         = args.Sites;
            _respectRobots = args.RespectRobots;
            _userAgent     = args.UserAgent;
            _maxDepth      = args.MaxDepth;
            _id            = args.ID;

            _web           = new HtmlWeb();
            _web.UserAgent = _userAgent;

            while (true)
            {
                Site next = null;
                if (!_sites.GetNextAvailableWorker(out next))
                {
                    Interlocked.Increment(ref _parent.PausedWorkers);
                    if (_sites.Empty() && _parent.PausedWorkers == _parent.TotalWorkers)
                    {
                        // The paused count is not decremented on this path, so it still includes
                        // this worker after it exits. NOTE(review): this appears to be what lets
                        // the remaining workers reach the same exit condition - confirm.
                        return;
                    }

                    Thread.Sleep(idleSleepMilliseconds);
                    Interlocked.Decrement(ref _parent.PausedWorkers);

                    continue;
                }

                String url = next.Url;

                // Release the read lock even if the lookup throws, so an exception here cannot
                // leave the shared lock held and block the other workers.
                bool visited;
                _visitedLock.EnterReadLock();
                try
                {
                    visited = _visited.Contains(url);
                }
                finally
                {
                    _visitedLock.ExitReadLock();
                }

                if (next.Depth < _maxDepth && !visited)
                {
                    VisitOneSite(next);
                }
            }
        }
Esempio n. 2
0
        /// <summary>
        /// Unpacks an (arguments, worker) pair and runs the worker with those arguments.
        /// The <see cref="object"/> parameter matches the shape of a
        /// <see cref="ParameterizedThreadStart"/> target.
        /// </summary>
        /// <param name="obj">
        /// Expected to be a <see cref="Tuple{T1,T2}"/> of <c>CrawlerWorkerArgs</c> and
        /// <c>CrawlerWorker</c> with both items non-null.
        /// </param>
        /// <exception cref="InvalidOperationException">
        /// <paramref name="obj"/> is null, is not the expected tuple type, or contains a null item.
        /// The error is logged as fatal before it is thrown.
        /// </exception>
        private void RunWorker(object obj)
        {
            // 'as' yields null for a null argument as well as for a wrong type, so a null payload
            // goes through the same logged error instead of raising a NullReferenceException.
            var tuple = obj as Tuple<CrawlerWorkerArgs, CrawlerWorker>;
            if (tuple == null || tuple.Item1 == null || tuple.Item2 == null)
            {
                String error = "Incorrect arguments provided to RunWorker, can't run crawler.";
                _log.Fatal(error);
                throw new InvalidOperationException(error);
            }

            CrawlerWorkerArgs args   = tuple.Item1;
            CrawlerWorker     worker = tuple.Item2;

            worker.Run(args);
        }
Esempio n. 3
0
        /// <summary>
        /// Starts one thread per configured worker, each running <c>RunWorker</c> with its own
        /// <c>CrawlerWorkerArgs</c> and a fresh <c>CrawlerWorker</c>, then blocks until every
        /// started thread has finished.
        /// </summary>
        public void Crawl()
        {
            var started = new List<Thread>();

            for (int i = 0; i < _settings.WorkerCount; i++)
            {
                var workerThread = new Thread(new ParameterizedThreadStart(RunWorker));

                // Every worker gets the same shared collections and settings; only the ID differs.
                var workerArgs = new CrawlerWorkerArgs
                {
                    Parent           = this,
                    Function         = _settings.Function,
                    Robots           = _robots,
                    Visited          = _visited,
                    VisitedLock      = _visitedLock,
                    Sites            = _sites,
                    RespectRobots    = _settings.RespectRobots,
                    UserAgent        = _settings.UserAgent,
                    MaxDepth         = _settings.MaxDepth,
                    BannedExtensions = _settings.BannedExtensions,
                    ID               = i,
                };

                // RunWorker expects the arguments and the worker packed into a single tuple.
                var payload = new Tuple<CrawlerWorkerArgs, CrawlerWorker>(workerArgs, new CrawlerWorker());

                _log.Debug($"Starting worker {i}");
                workerThread.Start(payload);
                started.Add(workerThread);
            }

            // Wait for all workers, in the order they were started.
            started.ForEach(t => t.Join());
        }