/// <summary>
/// Saves <paramref name="page"/> to a new, GUID-named <c>.html</c> file under the
/// output path and records the mapping from the page's absolute URI to that file name.
/// Does nothing if the URI is already present in the map.
/// </summary>
/// <param name="page">The page to persist; its <c>Uri.AbsoluteUri</c> is used as the map key.</param>
public void WriteFile(WebPage page)
{
    string pageKey = page.Uri.AbsoluteUri;
    if (_fileToDomainMap.ContainsKey(pageKey))
    {
        return;
    }

    string fileName = Guid.NewGuid().ToString() + ".html";
    string filePath = Path.Combine(_outputPath, fileName);
    page.SavePage(filePath);

    // Another caller may have registered the same URI while this one was saving.
    // In that case the file just written is referenced by nothing, so remove it
    // instead of leaving an orphan in the output directory.
    if (!_fileToDomainMap.TryAdd(pageKey, fileName))
    {
        File.Delete(filePath);
    }
}
/// <summary>
/// Checks whether <paramref name="webpage"/> may be visited according to the host's
/// robots.txt and, if so, blocks the calling thread until the recorded next-visit
/// time for the page's server address has passed.
/// </summary>
/// <param name="webpage">The page about to be fetched.</param>
/// <returns>
/// <c>false</c> if robots.txt could not be obtained (the URI is put back on the
/// frontier) or if robots.txt disallows the URI; otherwise <c>true</c>.
/// </returns>
private bool EnsurePoliteVisit(WebPage webpage)
{
    string hostname = webpage.Uri.DnsSafeHost;

    // NOTE(review): the cache is keyed by the host name's hash code, so two hosts
    // whose names hash alike would share one robots.txt entry. Keying by the host
    // name itself requires changing the declaration of _robotsTxts.
    int hostKey = hostname.GetHashCode();

    RobotsTxt robotsTxt;
    if (!_robotsTxts.TryGetValue(hostKey, out robotsTxt))
    {
        robotsTxt = _parser.Parse(hostname);
        if (robotsTxt == null)
        {
            // robots.txt is unavailable, so the page cannot be visited now; requeue it.
            _urlFrontier.AddUri(webpage.Uri);
            return false;
        }

        _robotsTxts.TryAdd(hostKey, robotsTxt);
    }

    if (!robotsTxt.CanVisit("*", webpage.Uri))
    {
        // robots.txt does not allow visiting this page.
        return false;
    }

    // Visiting is allowed; wait until the next permitted visit time for this server.
    int address = webpage.Address;
    long nextVisitTicks;
    if (_visitedServers.TryGetValue(address, out nextVisitTicks))
    {
        // Uses DateTime.Now because UpdateTimestamp stores DateTime.Now-based ticks;
        // the two must use the same clock.
        DateTime now = DateTime.Now;
        DateTime safeVisit = new DateTime(nextVisitTicks);
        int delay = (int) safeVisit.Subtract(now).TotalMilliseconds;
        if (delay > 0)
        {
            Thread.Sleep(delay);
        }
    }

    return true;
}
/// <summary>
/// Records the earliest time at which the server of <paramref name="webpage"/> may
/// be visited again: the current local time plus <c>TIME_BETWEEN_VISITS</c> seconds,
/// stored as ticks under the page's <c>Address</c>.
/// </summary>
/// <param name="webpage">The page whose server was just visited.</param>
private void UpdateTimestamp(WebPage webpage)
{
    // DateTime.Now is kept because EnsurePoliteVisit compares the stored ticks
    // against DateTime.Now.
    DateTime nextValidVisitTime = DateTime.Now.AddSeconds(TIME_BETWEEN_VISITS);

    // The indexer setter adds the key when absent and overwrites it when present,
    // so no separate existence check (and no ignored TryAdd result) is needed.
    _visitedServers[webpage.Address] = nextValidVisitTime.Ticks;
}
/// <summary>
/// Crawls a single URI: checks politeness rules, loads the page, records the visit
/// time, stores the page, updates statistics and queues the page's anchors.
/// </summary>
/// <param name="options">Supplies the cancellation token checked before the page is stored.</param>
/// <param name="uri">The URI to crawl; <c>null</c> is ignored.</param>
/// <exception cref="OperationCanceledException">
/// Cancellation was requested on <paramref name="options"/>' token after the page was loaded.
/// </exception>
private void ProcessUri(ParallelOptions options, Uri uri)
{
    if (uri == null)
    {
        return;
    }

    WebPage webpage = new WebPage(uri);

    // Is it safe to visit the webpage?
    if (!EnsurePoliteVisit(webpage))
    {
        return;
    }

    // Visit the webpage.
    webpage.LoadPage();

    // Record the visit time whether or not the load succeeded.
    UpdateTimestamp(webpage);

    // Nothing to store or extract if the page did not load.
    if (!webpage.IsLoaded)
    {
        return;
    }

    // Cooperative cancellation instead of Thread.Abort, which is obsolete and
    // throws PlatformNotSupportedException on .NET Core and .NET 5+.
    options.CancellationToken.ThrowIfCancellationRequested();

    _store.WriteFile(webpage);
    _statistics.IncrementPagesCrawled();

    // Add extracted anchors to the queue.
    _urlFrontier.AddUriRange(webpage.GetAnchors());
}