/// <summary>
/// Visits a single queued site: optionally checks it against the per-host robots
/// rules, records its URL in <c>_visited</c>, loads the page, hands it to
/// <c>_callback</c>, reports every URL the callback found to <c>_parent</c>, and
/// enqueues the follow-up links one level deeper than <paramref name="next"/>.
/// </summary>
/// <param name="next">The site to visit; its <c>Url</c> is parsed into a <see cref="Uri"/>.</param>
/// <remarks>
/// Exceptions raised while loading or processing the page are logged at Debug
/// level and swallowed. Parsing <c>next.Url</c> happens outside that handler, so
/// an exception thrown by the <see cref="Uri"/> constructor propagates to the caller.
/// </remarks>
private void VisitOneSite(Site next)
{
    Uri uri = new Uri(next.Url);
    string host = uri.Host;

    if (_respectRobots)
    {
        Robots config;
        if (!_robots.TryGetValue(host, out config))
        {
            // TODO: actually get the robots.txt
            config = new Robots(host, _userAgent);
            _robots.TryAdd(host, config);
        }

        if (!config.Allowed(uri))
        {
            return;
        }
    }

    // Release the write lock in a finally block so that a failure while
    // recording the URL cannot leave the lock held and stall other workers.
    _visitedLock.EnterWriteLock();
    try
    {
        _visited.Add(next.Url);
    }
    finally
    {
        _visitedLock.ExitWriteLock();
    }

    try
    {
        HtmlDocument doc = _web.Load(uri);

        List<string> found;
        List<Uri> nextSites;
        _callback(doc, uri, out found, out nextSites);

        foreach (string f in found)
        {
            _parent.UrlFound(f);
        }

        foreach (Uri link in nextSites)
        {
            Site temp = new Site()
            {
                Url = link.AbsoluteUri,
                Depth = next.Depth + 1
            };
            _sites.Enqueue(temp);
        }
    }
    catch (Exception e)
    {
        // Best-effort crawl: a failing page is logged and skipped.
        _log.Debug($"Worker {_id}: Error visiting site {uri.AbsolutePath}, exception message {e.Message}.");
    }
}
/// <summary>
/// Records a discovered URL. The first time a given URL is reported it is added
/// to <c>_found</c>, written as a line to <c>_outFile</c> (which is flushed
/// immediately), and logged at Info level. Repeated reports of the same URL do nothing.
/// </summary>
/// <param name="url">The URL that was found.</param>
public void UrlFound(string url)
{
    // Fast path: check membership under the read lock.
    bool found;
    _foundLock.EnterReadLock();
    try
    {
        found = _found.Contains(url);
    }
    finally
    {
        _foundLock.ExitReadLock();
    }

    if (found)
    {
        return;
    }

    _foundLock.EnterWriteLock();
    try
    {
        // Re-check under the write lock: another thread may have added the same
        // URL between releasing the read lock and acquiring the write lock, which
        // would otherwise produce a duplicate entry in the output file.
        if (_found.Contains(url))
        {
            return;
        }

        _found.Add(url);
        _outFile.WriteLine(url);
        _outFile.Flush();
    }
    finally
    {
        // Always release, even if writing to the output file throws.
        _foundLock.ExitWriteLock();
    }

    _log.Info($"!!! URL {url} !!!");
}