Example #1
0
        /// <summary>
        ///     Builds the unit under test from empty robots content and asserts that every
        ///     probed URL is reported as allowed, first for the fixture's user agent string
        ///     and then for the "userAgentCrawlDelayIs1" agent.
        /// </summary>
        public void IsUrlAllowed_EmptyRobotsContent_ReturnsTrue()
        {
            _unitUnderTest = new RobotsDotText(_rootUri, "");

            // Paths relative to the root; the empty entry probes the root URL itself.
            string[] allowedNamedPaths =
            {
                string.Empty,
                "allowedfolder/aa.html",
                "allowedfolder/bb.html",
                "allowedfile2"
            };
            string[] disallowedNamedPaths =
            {
                string.Empty,
                "disallowedfile.txt",
                "disallowedfolder",
                "disallowedfolder/",
                "disallowedfolder/subfolder",
                "disallowedfolder/subfolder/"
            };

            // Pass 1: the fixture's own agent string.
            string fixtureAgent = _userAgentString;
            foreach (string relativePath in allowedNamedPaths)
            {
                Assert.IsTrue(_unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri + relativePath, fixtureAgent));
            }

            // Pass 2: a named agent probing the "disallowed*" paths; nothing is disallowed
            // by empty content, so all of them are expected to be allowed.
            string namedAgent = "userAgentCrawlDelayIs1";
            foreach (string relativePath in disallowedNamedPaths)
            {
                Assert.IsTrue(_unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri + relativePath, namedAgent));
            }

            // Pass 3: the same named agent probing the "allowed*" paths.
            foreach (string relativePath in allowedNamedPaths)
            {
                Assert.IsTrue(_unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri + relativePath, namedAgent));
            }
        }
 /// <summary>
 ///     Rebuilds the unit under test from the fixture's root URI and robots content, then
 ///     stores in _realPage the result of a PageRequester request to http://localhost:1111/
 ///     made with a configuration whose UserAgentString is "aaa".
 /// </summary>
 public void SetUp()
 {
     _unitUnderTest = new RobotsDotText(_rootUri, _robotsContent);

     CrawlConfiguration requesterConfiguration = new CrawlConfiguration();
     requesterConfiguration.UserAgentString = "aaa";

     PageRequester pageRequester = new PageRequester(requesterConfiguration);
     Uri           localPageUri  = new Uri("http://localhost:1111/");

     _realPage = pageRequester.MakeRequest(localPageUri);
 }
 /// <summary>
 ///     Asserts that constructing a RobotsDotText with a null root URI throws
 ///     ArgumentNullException.
 /// </summary>
 public void Constructor_NullRootUri()
 {
     Uri missingRootUri = null;

     Assert.Throws <ArgumentNullException>(
         () => { _unitUnderTest = new RobotsDotText(missingRootUri, _robotsContent); });
 }
 /// <summary>
 ///     Asserts that constructing a RobotsDotText with null (string-typed) content throws
 ///     ArgumentNullException.
 /// </summary>
 public void Constructor_NullContent()
 {
     // The cast keeps the null typed as a string, as the constructor call requires.
     Assert.Throws <ArgumentNullException>(
         () => { _unitUnderTest = new RobotsDotText(_rootUri, (string)null); });
 }
Example #5
0
        /// <summary>
        ///     Feeds robots content whose wildcard group has a "Disallow:" directive followed
        ///     only by white space, and asserts that the root URL and "aa.html" are allowed.
        /// </summary>
        public void IsUrlAllowed_WildCardAgentWithWhiteSpaceDisallow_ReturnsTrue()
        {
            string agent = _userAgentString;
            string whiteSpaceDisallowContent = @"User-agent: *
Disallow: ";

            _unitUnderTest = new RobotsDotText(_rootUri, whiteSpaceDisallowContent);

            bool rootAllowed = _unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri, agent);
            Assert.IsTrue(rootAllowed);

            bool pageAllowed = _unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri + "aa.html", agent);
            Assert.IsTrue(pageAllowed);
        }
Example #6
0
        /// <summary>
        ///     Asserts that the root URL is allowed when the wildcard group disallows "/?/"
        ///     and "/category/". Currently ignored: the original author marked the behavior
        ///     under test as a known bug that still needs fixing.
        /// </summary>
        [Test, Ignore]//Known bug, pending a fix
        public void IsUrlAllowed_QuerystringOnRoot2_ReturnsTrue()
        {
            string agent = _userAgentString;
            string robotsContent = @"User-Agent: *
Disallow: /?/
Disallow: /category/";

            _unitUnderTest = new RobotsDotText(_rootUri, robotsContent);

            bool rootAllowed = _unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri, agent);
            Assert.IsTrue(rootAllowed);
        }
Example #7
0
        /// <summary>
        ///     Asserts that the bare root URL is allowed when the wildcard group only
        ///     disallows root URLs that carry specific query strings.
        /// </summary>
        public void IsUrlAllowed_QuerystringOnRoot_ReturnsTrue()
        {
            string agent = _userAgentString;
            string robotsContent = @"User-Agent: *
Disallow: /?category=whatever
Disallow: /?category=another&color=red";

            _unitUnderTest = new RobotsDotText(_rootUri, robotsContent);

            bool rootAllowed = _unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri, agent);
            Assert.IsTrue(rootAllowed);
        }
Example #8
0
        /// <summary>
        ///     Asserts that an arbitrary user agent is reported as not allowed when the
        ///     wildcard group disallows "/".
        /// </summary>
        public void IsUserAgentAllowed_WildCardUserAgent_ReturnsFalse()
        {
            // The content deliberately starts with a line break, exactly as in the literal below.
            _unitUnderTest = new RobotsDotText(_rootUri, @"
User-Agent: *
Disallow: /");

            bool agentAllowed = _unitUnderTest.IsUserAgentAllowed("aaaaaaaaaaaa");

            Assert.IsFalse(agentAllowed);
        }
Example #9
0
        /// <summary>
        ///     With two query-string disallow rules in the wildcard group, asserts that a URL
        ///     whose query string equals the first rule is not allowed, while a URL whose query
        ///     string only shares the start of the second rule is allowed.
        /// </summary>
        public void IsUrlAllowed_QuerystringMatch_NotSupported_ReturnsFalse()
        {
            string agent = _userAgentString;
            string robotsContent = @"User-Agent: *
Disallow: /?category=whatever
Disallow: /?category=another&color=red";

            _unitUnderTest = new RobotsDotText(_rootUri, robotsContent);

            string exactRuleUrl = _rootUri.AbsoluteUri + "?category=whatever";
            Assert.IsFalse(_unitUnderTest.IsUrlAllowed(exactRuleUrl, agent));

            string partialRuleUrl = _rootUri.AbsoluteUri + "?category=another&blah=blah";
            Assert.IsTrue(_unitUnderTest.IsUrlAllowed(partialRuleUrl, agent));
        }
Example #10
0
        /// <summary>
        ///     With two query-string disallow rules in the wildcard group, asserts that both
        ///     probed query-string URLs are still reported as allowed.
        /// </summary>
        public void IsUrlAllowed_QuerystringMatch_NotSupported_ReturnsTrue()
        {
            // A failure here would be good news: per the original author, it would mean the
            // robots implementation has started to support query strings.
            string agent = _userAgentString;
            string robotsContent = @"User-Agent: *
Disallow: /?category=whatever
Disallow: /?category=another&color=red";

            _unitUnderTest = new RobotsDotText(_rootUri, robotsContent);

            string exactRuleUrl = _rootUri.AbsoluteUri + "?category=whatever";
            Assert.IsTrue(_unitUnderTest.IsUrlAllowed(exactRuleUrl, agent));

            string partialRuleUrl = _rootUri.AbsoluteUri + "?category=another&blah=blah";
            Assert.IsTrue(_unitUnderTest.IsUrlAllowed(partialRuleUrl, agent));
        }
Example #11
0
        /// <summary>
        ///     Parses raw robots.txt bytes and collects the crawl delay and the disallowed
        ///     paths from groups addressed to the wildcard agent ("*") or to an agent whose
        ///     name contains <c>ApplicationSettings.UserAgent</c> (compared in lower case).
        /// </summary>
        /// <param name = "baseUri">The base URI against which each disallow path is resolved into an absolute URI.</param>
        /// <param name = "robotsDotTextSource">The raw robots.txt content; when null, nothing is parsed.</param>
        /// <returns>
        ///     A <c>RobotsDotText</c> whose <c>DisallowedPaths</c> list is always initialized
        ///     (possibly empty) and whose <c>CrawlDelay</c> is set only when an applicable,
        ///     integer-valued crawl-delay directive was found.
        /// </returns>
        public override RobotsDotText ParseRobotsDotTextSource(Uri baseUri, byte[] robotsDotTextSource)
        {
            const string crawlDelayDirective = "crawl-delay:";
            const string userAgentDirective  = "user-agent:";
            const string disallowDirective   = "disallow:";

            RobotsDotText robotsDotText = new RobotsDotText();

            robotsDotText.DisallowedPaths = new List <string>();

            if (robotsDotTextSource == null)
            {
                return(robotsDotText);
            }

            using (StreamReader streamReader = new StreamReader(new MemoryStream(robotsDotTextSource)))
            {
                string currentUserAgent     = string.Empty;
                bool   addToDisallowedPaths = false;

                while (!streamReader.EndOfStream)
                {
                    // trimmedLine keeps the original casing (needed for disallow paths);
                    // lineForSyntaxEvaluation is its lower-cased twin used to recognize directives.
                    // Invariant lower-casing maps character by character, so both strings have
                    // the same length and a directive prefix can be cut from either by length.
                    string trimmedLine             = streamReader.ReadLine().Trim();
                    string lineForSyntaxEvaluation = trimmedLine.ToLowerInvariant();

                    if (string.IsNullOrEmpty(lineForSyntaxEvaluation) || lineForSyntaxEvaluation.StartsWith("#", StringComparison.Ordinal))
                    {
                        continue;
                    }

                    if (lineForSyntaxEvaluation.StartsWith(crawlDelayDirective, StringComparison.Ordinal))
                    {
                        if (currentUserAgent == "*" || currentUserAgent.Contains(ApplicationSettings.UserAgent.ToLowerInvariant()))
                        {
                            // Strip only the leading directive; the remainder is the delay value.
                            string crawlDelayText = lineForSyntaxEvaluation.Substring(crawlDelayDirective.Length).Trim();

                            int crawlDelay;

                            if (int.TryParse(crawlDelayText, out crawlDelay))
                            {
                                robotsDotText.CrawlDelay = crawlDelay;
                            }
                        }

                        continue;
                    }

                    if (lineForSyntaxEvaluation.StartsWith(userAgentDirective, StringComparison.Ordinal))
                    {
                        // Match against the agent name only. Matching against the whole line
                        // would let a configured agent such as "agent" match the "user-agent:"
                        // prefix of every group.
                        string agentName = lineForSyntaxEvaluation.Substring(userAgentDirective.Length).Trim();

                        if (agentName == "*" || agentName.Contains(ApplicationSettings.UserAgent.ToLowerInvariant()))
                        {
                            currentUserAgent = agentName;

                            addToDisallowedPaths = true;
                        }
                        else
                        {
                            currentUserAgent = string.Empty;

                            addToDisallowedPaths = false;
                        }

                        continue;
                    }

                    if (addToDisallowedPaths && lineForSyntaxEvaluation.StartsWith(disallowDirective, StringComparison.Ordinal))
                    {
                        // Cut the directive off the front only: a replace-all would also delete
                        // any "disallow:" text that happens to appear inside the path itself.
                        string disallowedPath = trimmedLine.Substring(disallowDirective.Length).Trim();

                        if (!string.IsNullOrEmpty(disallowedPath))
                        {
                            Uri uri;

                            if (Uri.TryCreate(baseUri, disallowedPath, out uri) && !robotsDotText.DisallowedPaths.Contains(uri.AbsoluteUri))
                            {
                                robotsDotText.DisallowedPaths.Add(uri.AbsoluteUri);
                            }
                        }
                    }
                }
            }

            return(robotsDotText);
        }
Example #12
0
        /// <summary>
        ///     Constructs a RobotsDotText with null (string-typed) content and stores it as
        ///     the unit under test. No assertion is made inside this method body.
        /// </summary>
        public void Constructor_NullContent()
        {
            // The cast keeps the null typed as a string for the constructor call.
            _unitUnderTest = new RobotsDotText(_rootUri, (string)null);
        }
Example #13
0
 /// <summary>
 ///     Constructs a RobotsDotText with a null root URI and stores it as the unit under
 ///     test. No assertion is made inside this method body.
 /// </summary>
 public void Constructor_NullRootUri()
 {
     RobotsDotText constructedWithNullRoot = new RobotsDotText(null, _robotsContent);

     _unitUnderTest = constructedWithNullRoot;
 }
Example #14
0
 /// <summary>
 ///     Rebuilds the unit under test from the fixture's root URI and robots content.
 /// </summary>
 public void SetUp()
 {
     RobotsDotText freshInstance = new RobotsDotText(_rootUri, _robotsContent);

     _unitUnderTest = freshInstance;
 }