Beispiel #1
0
        /// <summary>
        /// Copies the worker configuration out of <paramref name="args"/> and then loops,
        /// taking sites from the shared collection and visiting each one that has not been
        /// visited yet and is below the configured maximum depth.
        /// The loop ends when no site is available, the collection reports empty, and the
        /// parent's paused-worker count equals its total worker count.
        /// </summary>
        /// <param name="args">Configuration and shared state for this worker.</param>
        /// <exception cref="ArgumentNullException"><paramref name="args"/> is null.</exception>
        /// <exception cref="ArgumentException">No callback function was supplied in <paramref name="args"/>.</exception>
        public void Run(CrawlerWorkerArgs args)
        {
            if (args == null)
            {
                throw new ArgumentNullException("args");
            }

            _parent   = args.Parent;
            _callback = args.Function;
            if (_callback == null)
            {
                throw new ArgumentException("No callback was provided so no work would be done.");
            }

            _robots        = args.Robots;
            _bannedExts    = args.BannedExtensions;
            _visited       = args.Visited;
            _visitedLock   = args.VisitedLock;
            _sites         = args.Sites;
            _respectRobots = args.RespectRobots;
            _userAgent     = args.UserAgent;
            _maxDepth      = args.MaxDepth;
            _id            = args.ID;

            _web           = new HtmlWeb();
            _web.UserAgent = _userAgent;

            while (true)
            {
                Site next = null;
                if (!_sites.GetNextAvailableWorker(out next))
                {
                    // Nothing to do right now: register as paused so the workers can
                    // collectively detect that the crawl has run dry.
                    Interlocked.Increment(ref _parent.PausedWorkers);
                    if (_sites.Empty() && _parent.PausedWorkers == _parent.TotalWorkers)
                    {
                        // The counter is intentionally left incremented on exit so the
                        // remaining workers still observe this one as paused.
                        return;
                    }

                    Thread.Sleep(500);
                    Interlocked.Decrement(ref _parent.PausedWorkers);

                    continue;
                }

                String url = next.Url;

                // Release the read lock even if the lookup throws; otherwise writers on
                // the shared visited set would block forever.
                bool visited;
                _visitedLock.EnterReadLock();
                try
                {
                    visited = _visited.Contains(url);
                }
                finally
                {
                    _visitedLock.ExitReadLock();
                }

                if (next.Depth < _maxDepth && !visited)
                {
                    VisitOneSite(next);
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Records <paramref name="url"/> the first time it is reported: adds it to the
        /// found collection, writes it as a line to the output file (flushing immediately),
        /// and logs it. URLs already in the collection are ignored.
        /// </summary>
        /// <param name="url">The URL that was found.</param>
        public void UrlFound(string url)
        {
            // Fast path: a shared read lock is enough to reject URLs that are already known.
            bool found;
            _foundLock.EnterReadLock();
            try
            {
                found = _found.Contains(url);
            }
            finally
            {
                _foundLock.ExitReadLock();
            }

            if (found)
            {
                return;
            }

            bool added = false;
            _foundLock.EnterWriteLock();
            try
            {
                // Re-check under the write lock: another thread may have recorded the same
                // URL between releasing the read lock and acquiring the write lock.
                if (!_found.Contains(url))
                {
                    _found.Add(url);
                    _outFile.WriteLine(url);
                    _outFile.Flush();
                    added = true;
                }
            }
            finally
            {
                _foundLock.ExitWriteLock();
            }

            // Log outside the lock, and only when this call actually recorded the URL.
            if (added)
            {
                _log.Info($"!!! URL {url} !!!");
            }
        }