Example #1
0
        /// <summary>
        /// Visits a single queued site: optionally checks the per-host robots
        /// rules, records the URL as visited, loads the page, hands it to the
        /// crawl callback, reports every URL the callback found to the parent
        /// and enqueues the follow-up links one level deeper.
        /// </summary>
        /// <param name="next">The site to visit; its <c>Url</c> is parsed into a <see cref="Uri"/>.</param>
        /// <remarks>
        /// The <see cref="Uri"/> is constructed outside the try block, so a
        /// malformed <c>next.Url</c> propagates to the caller as an exception.
        /// Failures while loading or processing the page are logged at debug
        /// level and swallowed; the URL stays in the visited set in that case.
        /// </remarks>
        private void VisitOneSite(Site next)
        {
            Uri    uri  = new Uri(next.Url);
            string host = uri.Host;

            if (_respectRobots)
            {
                Robots config;
                if (!_robots.TryGetValue(host, out config))
                {
                    // TODO: actually get the robots.txt
                    config = new Robots(host, _userAgent);

                    // The result is deliberately ignored: if another worker
                    // registered this host first, the local instance built
                    // from the same host and user agent is used for this visit.
                    _robots.TryAdd(host, config);
                }

                if (!config.Allowed(uri))
                {
                    return;
                }
            }

            // Mark the URL as visited before loading it. The lock is released
            // in a finally block so a failure inside the critical section can
            // never leave the write lock held and stall the other workers.
            _visitedLock.EnterWriteLock();
            try
            {
                _visited.Add(next.Url);
            }
            finally
            {
                _visitedLock.ExitWriteLock();
            }

            try
            {
                HtmlDocument doc = _web.Load(uri);

                List <string> found;
                List <Uri>    nextSites;
                _callback(doc, uri, out found, out nextSites);

                // Report every URL the callback extracted from this page.
                foreach (string f in found)
                {
                    _parent.UrlFound(f);
                }

                // Queue the outgoing links, one level deeper than this page.
                foreach (Uri link in nextSites)
                {
                    Site temp = new Site()
                    {
                        Url   = link.AbsoluteUri,
                        Depth = next.Depth + 1
                    };

                    _sites.Enqueue(temp);
                }
            }
            catch (Exception e)
            {
                // Best effort: one failing page must not stop the worker.
                _log.Debug($"Worker {_id}: Error visiting site {uri.AbsolutePath}, exception message {e.Message}.");
            }
        }
Example #2
0
        /// <summary>
        /// Records a discovered URL. The first time a given URL is reported it
        /// is added to the found set, written to the output file (which is
        /// flushed immediately) and logged; later reports of the same URL are
        /// ignored.
        /// </summary>
        /// <param name="url">The URL that was discovered.</param>
        public void UrlFound(string url)
        {
            // Fast path: a shared read lock is enough to rule out known URLs.
            bool found;
            _foundLock.EnterReadLock();
            try
            {
                found = _found.Contains(url);
            }
            finally
            {
                _foundLock.ExitReadLock();
            }

            if (found)
            {
                return;
            }

            bool added = false;
            _foundLock.EnterWriteLock();
            try
            {
                // Re-check under the write lock: another thread may have added
                // the same URL between releasing the read lock and getting
                // here, which would otherwise write and log it twice.
                if (!_found.Contains(url))
                {
                    _found.Add(url);
                    _outFile.WriteLine(url);
                    _outFile.Flush();
                    added = true;
                }
            }
            finally
            {
                _foundLock.ExitWriteLock();
            }

            // Log outside the lock, and only for the thread that added the URL.
            if (added)
            {
                _log.Info($"!!! URL {url} !!!");
            }
        }