Example #1
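Applies a robots.txt crawl-delay to a request processor: the crawl-delay (given in seconds) is compared against the configured delay between requests and the longer of the two is kept.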
        private void UpdateCrawlDelay(RobotsFile robotsFile, string userAgent, RequestProcessorOptions requestProcessorOptions)
        {
            // Apply the robots.txt crawl-delay (if defined); the delay is given in seconds,
            // so take the longer of it (in milliseconds) and the configured delay between requests
            var userAgentEntry    = robotsFile.GetEntryForUserAgent(userAgent);
            var minimumCrawlDelay = userAgentEntry?.CrawlDelay ?? 0;
            var taskDelay         = Math.Max(minimumCrawlDelay * 1000, requestProcessorOptions.DelayBetweenRequestStart.TotalMilliseconds);

            requestProcessorOptions.DelayBetweenRequestStart = new TimeSpan(0, 0, 0, 0, (int)taskDelay);
        }
Example #2
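Checks user-agent-specific rules against a saved copy of GitHub's robots.txt: the same URL is disallowed for the test's default user agent but not for Googlebot.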
        public void GithubTest()
        {
            var path = Path.Combine(testdataPath, "www.github.com-robots.txt");

            using (var robots = new RobotsFile(new Uri("https://github.com/robots.txt"), File.Open(path, FileMode.Open)))
            {
                // The same URL is disallowed for the test's default user agent but not for Googlebot
                Assert.IsTrue(robots.IsDisallowed(new Uri("https://github.com/nullabork/fetcho/blob/master/README.md"), userAgent));
                Assert.IsTrue(robots.IsNotDisallowed(new Uri("https://github.com/nullabork/fetcho/blob/master/README.md"), "Googlebot"));
            }
        }
Example #3
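A crawler constructor that stores the base URI, robots.txt file, HTTP client, and settings, then queues the base URI as the first request.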
        public CrawlRunner(Uri baseUri, RobotsFile robotsFile, HttpClient httpClient, CrawlSettings crawlSettings, ILogger logger = null)
        {
            BaseUri    = baseUri;
            RobotsFile = robotsFile;
            HttpClient = httpClient;
            Settings   = crawlSettings;

            Logger           = logger;
            RobotsPageParser = new RobotsPageParser();

            AddRequest(baseUri);
        }
Example #4
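Verifies allow/disallow decisions against a saved copy of Wikipedia's robots.txt: article and talk pages are allowed, while edit actions and special pages are blocked.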
        public void WikipediaTest()
        {
            var path = Path.Combine(testdataPath, "en.wikipedia.org-robots.txt");

            using (var robots = new RobotsFile(new Uri("https://en.wikipedia.org/robots.txt"), File.Open(path, FileMode.Open)))
            {
                // Article and talk pages are allowed
                Assert.IsFalse(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Main_Page")));
                Assert.IsFalse(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Event_Horizon_Telescope")));
                Assert.IsFalse(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Talk:Event_Horizon_Telescope")));

                // Edit actions and special pages are disallowed
                Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/w/index.php?title=Talk:Event_Horizon_Telescope&action=edit")));
                Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Special:Random")));
                Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/w/index.php?title=Ahmet_Davutoglu&action=edit&section=34")));
            }
        }
Example #5
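Confirms that the rule "Disallow: /news/0" only blocks paths that actually start with /news/0, not other paths that merely contain similar digits.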
        public void R_UnusualRobotsRule()
        {
            const string txtfile = "User-agent: *\n\nDisallow: /news/0\n";
            Uri          uri     = new Uri("https://www.example.com/news/world-asia-40360168");
            Uri          uri2    = new Uri("https://www.example.com/040360168");
            var          buffer  = System.Text.Encoding.UTF8.GetBytes(txtfile);

            var robots = new RobotsFile(RobotsFetcher.MakeRobotsUri(uri), buffer);

            Console.WriteLine(robots.ToString());

            // "Disallow: /news/0" should only match paths that begin with "/news/0",
            // so neither of these URIs is blocked
            Assert.IsFalse(robots.IsDisallowed(uri));
            Assert.IsFalse(robots.IsDisallowed(uri2));
        }
Example #6
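Exercises wildcard (*) and end-of-path ($) patterns in disallow rules built from an inline robots.txt string.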
        public void DisallowedTest()
        {
            var txt    = "user-agent: *\n\ndisallow: /data/*\ndisallow: /daylight/$\ndisallow: /jerk\ndisallow: /h*ray.html$";
            var buffer = System.Text.Encoding.UTF8.GetBytes(txt);

            var robots = new RobotsFile(new Uri("https://www.example.com/robots.txt"), buffer);

            Assert.IsTrue(robots.IsDisallowed(new Uri("http://rofflo.org/jerk")));
            Assert.IsTrue(robots.IsDisallowed(new Uri("http://rofflo.org/data/hooray.html")));
            Assert.IsTrue(robots.IsDisallowed(new Uri("http://rofflo.org/hooray.html")));
            Assert.IsTrue(!robots.IsDisallowed(new Uri("http://rofflo.org/daylight/loafo.html")));
            Assert.IsTrue(robots.IsDisallowed(new Uri("http://rofflo.org/daylight/")));
            Assert.IsTrue(!robots.IsDisallowed(new Uri("http://rofflo.org/index.html")));
            Assert.IsTrue(!robots.IsDisallowed(new Uri("http://rofflo.org/")));
        }
Example #7
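A rough performance check: one million loop iterations of five IsDisallowed calls each against Wikipedia's robots.txt must complete in under 14 seconds.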
        public void SpeedTest()
        {
            // Check that one million iterations of five IsDisallowed calls each complete in under 14 seconds (ignoring setup)
            var path = Path.Combine(testdataPath, "en.wikipedia.org-robots.txt");

            using (var robots = new RobotsFile(new Uri("https://en.wikipedia.org/robots.txt"), File.Open(path, FileMode.Open)))
            {
                DateTime startTime = DateTime.Now;
                for (int i = 0; i < 1000000; i++)
                {
                    Assert.IsFalse(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Main_Page")));
                    Assert.IsFalse(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Event_Horizon_Telescope")));
                    Assert.IsFalse(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Talk:Event_Horizon_Telescope")));
                    Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/w/index.php?title=Talk:Event_Horizon_Telescope&action=edit")));
                    Assert.IsTrue(robots.IsDisallowed(new Uri("https://en.wikipedia.org/wiki/Special:Random")));
                }
                var time = DateTime.Now - startTime;
                Assert.IsTrue(time.TotalSeconds < 14, time.ToString());
            }
        }
Example #8
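Queues a URI for crawling after checking host restrictions, the page crawl limit, any existing crawl state (previous successes, retry and redirect limits), and robots.txt permissions.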
        private void AddRequest(Uri requestUri, bool skipMaxPageCheck)
        {
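            // Only crawl hosts matching the base URI or one of the configured host aliases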
            if (Settings.HostAliases != null)
            {
                if (!(requestUri.Host == BaseUri.Host || Settings.HostAliases.Contains(requestUri.Host)))
                {
                    Logger?.LogDebug($"{requestUri.Host} is not in the list of allowed hosts.");
                    return;
                }
            }
            else if (requestUri.Host != BaseUri.Host)
            {
                Logger?.LogDebug($"{requestUri.Host} doesn't match the base host.");
                return;
            }

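            // Enforce the maximum page count, unless the caller asked to skip this check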
            if (!skipMaxPageCheck && Settings.MaxNumberOfPagesToCrawl > 0)
            {
                var expectedCrawlCount = CrawledUris.Count + Settings.RequestProcessor.PendingRequests;
                if (expectedCrawlCount >= Settings.MaxNumberOfPagesToCrawl)
                {
                    Logger?.LogDebug($"Page crawl limit blocks adding request for {requestUri}");
                    return;
                }
            }

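            // Record the URI as seen before evaluating any existing crawl state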
            SeenUris.TryAdd(requestUri, 0);

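            // If this URI has been attempted before, stop when it already succeeded or has hit the retry/redirect limits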
            if (UriCrawlStates.TryGetValue(requestUri, out var crawlState))
            {
                var lastRequest = crawlState.Requests.LastOrDefault();
                if (lastRequest != null && lastRequest.IsSuccessfulStatus)
                {
                    return;
                }

                if (crawlState.Requests.Count() == Settings.NumberOfRetries)
                {
                    AddResult(new CrawledUri
                    {
                        Location      = crawlState.Location,
                        Status        = CrawlStatus.MaxRetries,
                        Requests      = crawlState.Requests,
                        RedirectChain = crawlState.Redirects
                    });
                    return;
                }

                if (crawlState.Redirects != null && crawlState.Redirects.Count == Settings.MaxNumberOfRedirects)
                {
                    AddResult(new CrawledUri
                    {
                        Location      = crawlState.Location,
                        RedirectChain = crawlState.Redirects,
                        Status        = CrawlStatus.MaxRedirects
                    });
                    return;
                }
            }

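            // Queue the request only if robots.txt allows access for the configured user agent; otherwise record it as blocked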
            if (RobotsFile.IsAllowedAccess(requestUri, Settings.UserAgent))
            {
                Settings.RequestProcessor.Add(requestUri);
            }
            else
            {
                AddResult(new CrawledUri
                {
                    Location = requestUri,
                    Status   = CrawlStatus.RobotsBlocked
                });
            }
        }