Beispiel #1
0
        /// <summary>
        /// Visits a single site: optionally checks it against the per-host robots configuration,
        /// records its URL as visited, loads the page, passes it to the crawl callback, reports
        /// every found string to the parent and enqueues each discovered link one level deeper.
        /// </summary>
        /// <param name="next">
        /// The site to visit. Its <c>Url</c> is parsed into a <see cref="Uri"/> and its
        /// <c>Depth</c> is the base for the depth assigned to newly enqueued links.
        /// </param>
        /// <remarks>
        /// URL parsing and the robots check run outside the try block, so an exception thrown
        /// there propagates to the caller. Exceptions raised while loading or processing the page
        /// are logged at debug level and otherwise swallowed.
        /// </remarks>
        private void VisitOneSite(Site next)
        {
            Uri    uri  = new Uri(next.Url);
            string host = uri.Host;

            if (_respectRobots)
            {
                Robots config;
                if (!_robots.TryGetValue(host, out config))
                {
                    // TODO: actually get the robots.txt
                    config = new Robots(host, _userAgent);

                    _robots.TryAdd(host, config);
                }

                if (!config.Allowed(uri))
                {
                    // Disallowed URLs are skipped without being recorded as visited.
                    return;
                }
            }

            // Release the write lock even if Add throws; otherwise the lock would stay held
            // and every other thread waiting on it would block.
            _visitedLock.EnterWriteLock();
            try
            {
                _visited.Add(next.Url);
            }
            finally
            {
                _visitedLock.ExitWriteLock();
            }

            try
            {
                HtmlDocument doc = _web.Load(uri);

                List <string> found;
                List <Uri>    nextSites;
                _callback(doc, uri, out found, out nextSites);

                foreach (string f in found)
                {
                    _parent.UrlFound(f);
                }

                foreach (Uri link in nextSites)
                {
                    Site temp = new Site()
                    {
                        Url   = link.AbsoluteUri,
                        Depth = next.Depth + 1
                    };

                    _sites.Enqueue(temp);
                }
            }
            catch (Exception e)
            {
                // Log the full URI (scheme + host + path): the path alone does not identify
                // which site failed when several hosts are being crawled.
                _log.Debug($"Worker {_id}: Error visiting site {uri.AbsoluteUri}, exception message {e.Message}.");
            }
        }
Beispiel #2
0
        /// <summary>
        /// Creates a crawler from the given settings: stores the settings, copies the worker
        /// count, creates the site queue and the output writer, and enqueues every seed URL
        /// at depth 0.
        /// </summary>
        /// <param name="settings">Crawler configuration; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="settings"/> is null.</exception>
        public Crawler(CrawlerSettings settings)
        {
            if (settings == null)
            {
                throw new ArgumentNullException(nameof(settings));
            }

            _settings    = settings;
            TotalWorkers = _settings.WorkerCount;
            _sites       = new CrawlerQueue(_settings.RespectRobots, _settings.UserAgent);

            // FileMode.Create creates the output file, or truncates it if it already exists.
            _outFile     = new StreamWriter(new FileStream(_settings.OutputPath, FileMode.Create));

            try
            {
                foreach (string seed in _settings.Seeds)
                {
                    Site s = new Site()
                    {
                        Url   = seed,
                        Depth = 0
                    };

                    _sites.Enqueue(s);
                }
            }
            catch
            {
                // A failed constructor leaves no instance for the caller to dispose, so close
                // the already opened output file here before propagating the error.
                _outFile.Dispose();
                throw;
            }
        }