Exemple #1
0
        /// <summary>
        /// Worker entry point. Copies the shared crawl state out of <paramref name="args"/>,
        /// creates this worker's <c>HtmlWeb</c> client, and then loops: take the next site
        /// from the shared queue and visit it, until the queue is empty and every worker
        /// is idle.
        /// </summary>
        /// <param name="args">
        /// Shared state for this worker (parent crawler, callback, queue, visited set and
        /// its lock, robots/extension filters, user agent, depth limit and worker id).
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="args"/> is null.</exception>
        /// <exception cref="ArgumentException">No callback was supplied in <paramref name="args"/>.</exception>
        public void Run(CrawlerWorkerArgs args)
        {
            // How long an idle worker waits before polling the queue again.
            const int IdlePollMilliseconds = 500;

            if (args == null)
            {
                throw new ArgumentNullException("args");
            }

            _parent   = args.Parent;
            _callback = args.Function;
            if (_callback == null)
            {
                throw new ArgumentException("No callback was provided so no work would be done.");
            }

            _robots        = args.Robots;
            _bannedExts    = args.BannedExtensions;
            _visited       = args.Visited;
            _visitedLock   = args.VisitedLock;
            _sites         = args.Sites;
            _respectRobots = args.RespectRobots;
            _userAgent     = args.UserAgent;
            _maxDepth      = args.MaxDepth;
            _id            = args.ID;

            _web           = new HtmlWeb();
            _web.UserAgent = _userAgent;

            while (true)
            {
                Site next = null;
                if (!_sites.GetNextAvailableWorker(out next))
                {
                    // Use the value returned by the atomic increment rather than re-reading
                    // the shared field, so the comparison is made against the count this
                    // worker itself produced.
                    var pausedNow = Interlocked.Increment(ref _parent.PausedWorkers);
                    if (_sites.Empty() && pausedNow == _parent.TotalWorkers)
                    {
                        // Deliberately leave the counter incremented on exit: the remaining
                        // workers need to keep seeing this worker as paused so that they
                        // can reach the same "everyone is idle" condition and stop too.
                        return;
                    }

                    Thread.Sleep(IdlePollMilliseconds);
                    Interlocked.Decrement(ref _parent.PausedWorkers);

                    continue;
                }

                String url = next.Url;

                bool visited;
                _visitedLock.EnterReadLock();
                try
                {
                    visited = _visited.Contains(url);
                }
                finally
                {
                    // Always release the read lock, even if the lookup throws; otherwise
                    // any later writer on this lock would block forever.
                    _visitedLock.ExitReadLock();
                }

                // Sites at or beyond the depth limit, or already visited, are dropped.
                if (next.Depth < _maxDepth && !visited)
                {
                    VisitOneSite(next);
                }
            }
        }
Exemple #2
0
        /// <summary>
        /// Builds a crawler from <paramref name="settings"/>: records the worker count,
        /// creates the shared site queue, opens the output file (created or truncated via
        /// <c>FileMode.Create</c>), and enqueues every seed URL at depth 0.
        /// </summary>
        /// <param name="settings">
        /// Crawl configuration; this constructor reads its worker count, robots flag,
        /// user agent, output path and seed URLs.
        /// </param>
        /// <exception cref="ArgumentNullException"><paramref name="settings"/> is null.</exception>
        public Crawler(CrawlerSettings settings)
        {
            if (settings == null)
            {
                throw new ArgumentNullException("settings");
            }

            _settings    = settings;
            TotalWorkers = _settings.WorkerCount;
            _sites       = new CrawlerQueue(_settings.RespectRobots, _settings.UserAgent);
            _outFile     = new StreamWriter(new FileStream(_settings.OutputPath, FileMode.Create));

            try
            {
                foreach (String seed in _settings.Seeds)
                {
                    _sites.Enqueue(new Site() { Url = seed, Depth = 0 });
                }
            }
            catch
            {
                // If seeding fails the constructor throws and the caller never receives
                // an instance it could dispose, so release the output file handle here.
                _outFile.Dispose();
                throw;
            }
        }