private void UpdateCrawlDelay(RobotsFile robotsFile, string userAgent, RequestProcessorOptions requestProcessorOptions)
{
    // Apply the robots.txt crawl-delay (if defined).
    // Crawl-delay is expressed in seconds, so convert it to milliseconds
    // before comparing against the configured delay; the larger value wins.
    var userAgentEntry = robotsFile.GetEntryForUserAgent(userAgent);
    var minimumCrawlDelay = userAgentEntry?.CrawlDelay ?? 0;
    var taskDelay = Math.Max(minimumCrawlDelay * 1000, requestProcessorOptions.DelayBetweenRequestStart.TotalMilliseconds);
    requestProcessorOptions.DelayBetweenRequestStart = TimeSpan.FromMilliseconds(taskDelay);
}
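// To make the delay arbitration above concrete, here is a minimal sketch of
// the same Math.Max computation with illustrative values; the numbers are
// assumptions for demonstration, not values from the library.
private static void CrawlDelayArbitrationSketch()
{
    var minimumCrawlDelaySeconds = 10;             // robots.txt "Crawl-delay: 10"
    var configuredDelay = TimeSpan.FromSeconds(5); // operator-configured default

    var effectiveDelayMs = Math.Max(
        minimumCrawlDelaySeconds * 1000,           // 10000 ms from robots.txt
        configuredDelay.TotalMilliseconds);        // 5000 ms from configuration

    // effectiveDelayMs == 10000: the stricter robots.txt delay wins.
    Console.WriteLine(effectiveDelayMs);
}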
public void GithubTest()
{
    var path = Path.Combine(testdataPath, "www.github.com-robots.txt");
    using (var robots = new RobotsFile(new Uri("https://github.com/robots.txt"), File.Open(path, FileMode.Open)))
    {
        // The same URI can be disallowed for one user agent but allowed for another.
        Assert.IsTrue(robots.IsDisallowed(new Uri("https://github.com/nullabork/fetcho/blob/master/README.md"), userAgent));
        Assert.IsTrue(robots.IsNotDisallowed(new Uri("https://github.com/nullabork/fetcho/blob/master/README.md"), "Googlebot"));
    }
}
public CrawlRunner(Uri baseUri, RobotsFile robotsFile, HttpClient httpClient, CrawlSettings crawlSettings, ILogger logger = null)
{
    BaseUri = baseUri;
    RobotsFile = robotsFile;
    HttpClient = httpClient;
    Settings = crawlSettings;
    Logger = logger;
    RobotsPageParser = new RobotsPageParser();

    AddRequest(baseUri);
}
public void WikipediaTest()
{
    var path = Path.Combine(testdataPath, "en.wikipedia.org-robots.txt");
    using (var robots = new RobotsFile(new Uri("https://en.wikipedia.org/robots.txt"), File.Open(path, FileMode.Open)))
    {
        Assert.IsTrue(!robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Main_Page")));
        Assert.IsTrue(!robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Event_Horizon_Telescope")));
        Assert.IsTrue(!robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Talk:Event_Horizon_Telescope")));
        Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/w/index.php?title=Talk:Event_Horizon_Telescope&action=edit")));
        Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Special:Random")));
        Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/w/index.php?title=Ahmet_Davutoglu&action=edit&section=34")));
    }
}
public void R_UnusualRobotsRule()
{
    // "Disallow: /news/0" is a plain prefix rule: only paths beginning with
    // /news/0 are blocked, so both URIs below remain allowed.
    const string txtfile = "User-agent: *\n\nDisallow: /news/0\n";
    Uri uri = new Uri("https://www.example.com/news/world-asia-40360168");
    Uri uri2 = new Uri("https://www.example.com/040360168");

    var buffer = System.Text.Encoding.UTF8.GetBytes(txtfile);
    var robots = new RobotsFile(RobotsFetcher.MakeRobotsUri(uri), buffer);
    Console.WriteLine(robots.ToString());

    Assert.IsTrue(!robots.IsDisallowed(uri));
    Assert.IsTrue(!robots.IsDisallowed(uri2));
}
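// A hedged sketch of what RobotsFetcher.MakeRobotsUri presumably does, based
// only on its name and usage above (not the fetcher's actual code): derive
// the well-known robots.txt location from any page URI on the same host.
private static Uri MakeRobotsUriSketch(Uri anyUri) =>
    new Uri($"{anyUri.Scheme}://{anyUri.Authority}/robots.txt");

// MakeRobotsUriSketch(new Uri("https://www.example.com/news/world-asia-40360168"))
//   => https://www.example.com/robots.txt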
public void DisallowedTest()
{
    // Exercises prefix rules, "*" wildcards and "$" end-of-path anchors.
    var txt = "user-agent: *\n\ndisallow: /data/*\ndisallow: /daylight/$\ndisallow: /jerk\ndisallow: /h*ray.html$";
    var buffer = System.Text.Encoding.UTF8.GetBytes(txt);
    var robots = new RobotsFile(new Uri("https://www.example.com/robots.txt"), buffer);

    Assert.IsTrue(robots.IsDisallowed(new Uri("http://rofflo.org/jerk")));                 // prefix rule
    Assert.IsTrue(robots.IsDisallowed(new Uri("http://rofflo.org/data/hooray.html")));     // wildcard rule
    Assert.IsTrue(robots.IsDisallowed(new Uri("http://rofflo.org/hooray.html")));          // /h*ray.html$
    Assert.IsTrue(!robots.IsDisallowed(new Uri("http://rofflo.org/daylight/loafo.html"))); // "$" anchors /daylight/ exactly
    Assert.IsTrue(robots.IsDisallowed(new Uri("http://rofflo.org/daylight/")));
    Assert.IsTrue(!robots.IsDisallowed(new Uri("http://rofflo.org/index.html")));
    Assert.IsTrue(!robots.IsDisallowed(new Uri("http://rofflo.org/")));
}
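// The "*" and "$" handling exercised above can be illustrated with a small
// regex translation; this is a sketch of the matching semantics under a
// straightforward regex strategy, not the library's actual implementation.
private static bool WildcardRuleMatchesSketch(string rule, string path)
{
    var anchored = rule.EndsWith("$");
    var body = anchored ? rule.Substring(0, rule.Length - 1) : rule;

    // Escape regex metacharacters, turn the robots "*" wildcard into ".*",
    // and pin anchored rules to the end of the path.
    var pattern = "^" + System.Text.RegularExpressions.Regex.Escape(body).Replace(@"\*", ".*")
        + (anchored ? "$" : "");
    return System.Text.RegularExpressions.Regex.IsMatch(path, pattern);
}

// WildcardRuleMatchesSketch("/h*ray.html$", "/hooray.html")        => true
// WildcardRuleMatchesSketch("/daylight/$", "/daylight/loafo.html") => false
// WildcardRuleMatchesSketch("/daylight/$", "/daylight/")           => true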
public void SpeedTest()
{
    // Five robots checks per iteration, one million iterations, expected to
    // complete in under 14 seconds (ignoring setup).
    var path = Path.Combine(testdataPath, "en.wikipedia.org-robots.txt");
    using (var robots = new RobotsFile(new Uri("https://en.wikipedia.org/robots.txt"), File.Open(path, FileMode.Open)))
    {
        DateTime startTime = DateTime.Now;

        for (int i = 0; i < 1000000; i++)
        {
            Assert.IsTrue(!robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Main_Page")));
            Assert.IsTrue(!robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Event_Horizon_Telescope")));
            Assert.IsTrue(!robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Talk:Event_Horizon_Telescope")));
            Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/w/index.php?title=Talk:Event_Horizon_Telescope&action=edit")));
            Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Special:Random")));
        }

        var time = DateTime.Now - startTime;
        Assert.IsTrue(time.TotalSeconds < 14, time.ToString());
    }
}
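// DateTime.Now has coarse resolution and moves with system clock adjustments.
// A Stopwatch-based variant (a suggested alternative, not the original test)
// gives a monotonic, higher-resolution measurement:
var stopwatch = System.Diagnostics.Stopwatch.StartNew();

// ... run the same million-iteration loop of robots checks here ...

stopwatch.Stop();
Assert.IsTrue(stopwatch.Elapsed.TotalSeconds < 14, stopwatch.Elapsed.ToString());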
private void AddRequest(Uri requestUri, bool skipMaxPageCheck)
{
    // Host gate: the URI must match the base host or one of its aliases.
    if (Settings.HostAliases != null)
    {
        if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host)))
        {
            Logger?.LogDebug($"{requestUri.Host} is not in the list of allowed hosts.");
            return;
        }
    }
    else if (requestUri.Host != BaseUri.Host)
    {
        Logger?.LogDebug($"{requestUri.Host} doesn't match the base host.");
        return;
    }

    // Page-limit gate: stop queueing once crawled plus pending requests reach the cap.
    if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0)
    {
        var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests;
        if (expectedCrawlCount == Settings.MaxNumberOfPagesToCrawl)
        {
            Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}");
            return;
        }
    }

    SeenUris.TryAdd(requestUri, 0);

    // Retry/redirect gates: give up on URIs that already hit their caps.
    if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
    {
        var lastRequest = crawlState.Requests.LastOrDefault();
        if (lastRequest != null && lastRequest.IsSuccessfulStatus)
        {
            return;
        }

        if (crawlState.Requests.Count() == Settings.NumberOfRetries)
        {
            AddResult(new CrawledUri
            {
                Location = crawlState.Location,
                Status = CrawlStatus.MaxRetries,
                Requests = crawlState.Requests,
                RedirectChain = crawlState.Redirects
            });
            return;
        }

        if (crawlState.Redirects != null && crawlState.Redirects.Count == Settings.MaxNumberOfRedirects)
        {
            AddResult(new CrawledUri
            {
                Location = crawlState.Location,
                RedirectChain = crawlState.Redirects,
                Status = CrawlStatus.MaxRedirects
            });
            return;
        }
    }

    // Robots gate last, so blocked URIs are still recorded as results.
    if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent))
    {
        Settings.RequestProcessor.Add(requestUri);
    }
    else
    {
        AddResult(new CrawledUri
        {
            Location = requestUri,
            Status = CrawlStatus.RobotsBlocked
        });
    }
}
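// SeenUris.TryAdd(requestUri, 0) above suggests a ConcurrentDictionary used as
// a thread-safe set; this minimal sketch assumes a ConcurrentDictionary<Uri, byte>
// field (the exact field type is an assumption, not confirmed by the source).
private static void SeenUrisDedupSketch()
{
    var seenUris = new System.Collections.Concurrent.ConcurrentDictionary<Uri, byte>();

    var uri = new Uri("https://example.com/page");
    if (seenUris.TryAdd(uri, 0))
    {
        // First sighting of this URI: safe to queue it exactly once, even when
        // multiple worker threads discover the same link concurrently.
    }
}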