// Builds a RobotsDotText from empty content and asserts that IsUrlAllowed
// returns true for every probed url, for both the default user agent string
// and the named agent "userAgentCrawlDelayIs1".
public void IsUrlAllowed_EmptyRobotsContent_ReturnsTrue()
{
    _unitUnderTest = new RobotsDotText(_rootUri, "");

    string root = _rootUri.AbsoluteUri;

    // An empty suffix probes the root url itself.
    string[] allowedSuffixes =
    {
        "",
        "allowedfolder/aa.html",
        "allowedfolder/bb.html",
        "allowedfile2"
    };
    string[] disallowedLookingSuffixes =
    {
        "",
        "disallowedfile.txt",
        "disallowedfolder",
        "disallowedfolder/",
        "disallowedfolder/subfolder",
        "disallowedfolder/subfolder/"
    };

    //Should use "*" user agent by default
    string agent = _userAgentString;
    foreach (string suffix in allowedSuffixes)
    {
        Assert.IsTrue(_unitUnderTest.IsUrlAllowed(root + suffix, agent));
    }

    //User agent "userAgentCrawlDelayIs1" doesn't specify anything to disallow so should allow all ("*" is not inherited)
    agent = "userAgentCrawlDelayIs1";
    foreach (string suffix in disallowedLookingSuffixes)
    {
        Assert.IsTrue(_unitUnderTest.IsUrlAllowed(root + suffix, agent));
    }

    //Allows all since "userAgentCrawlDelayIs1" does not specify allow or disallow
    agent = "userAgentCrawlDelayIs1";
    foreach (string suffix in allowedSuffixes)
    {
        Assert.IsTrue(_unitUnderTest.IsUrlAllowed(root + suffix, agent));
    }
}
// Per-test fixture setup: rebuilds the unit under test from the shared robots
// content and requests a page from the local test endpoint.
public void SetUp()
{
    _unitUnderTest = new RobotsDotText(_rootUri, _robotsContent);

    CrawlConfiguration configuration = new CrawlConfiguration { UserAgentString = "aaa" };
    PageRequester requester = new PageRequester(configuration);

    // NOTE(review): this issues a request to http://localhost:1111/ - presumably
    // a local test site must be running for the fixture to work; confirm.
    _realPage = requester.MakeRequest(new Uri("http://localhost:1111/"));
}
// Constructing RobotsDotText with a null root uri must throw ArgumentNullException.
public void Constructor_NullRootUri()
{
    Assert.Throws<ArgumentNullException>(
        () => _unitUnderTest = new RobotsDotText(null, _robotsContent));
}
// Constructing RobotsDotText with null robots content must throw ArgumentNullException.
public void Constructor_NullContent()
{
    // Typed as string so the call binds exactly as it would for real content.
    string missingContent = null;

    Assert.Throws<ArgumentNullException>(
        () => _unitUnderTest = new RobotsDotText(_rootUri, missingContent));
}
// A wildcard group whose Disallow value is only whitespace must not block
// anything: both the root url and a page beneath it are expected to be allowed.
public void IsUrlAllowed_WildCardAgentWithWhiteSpaceDisallow_ReturnsTrue()
{
    // NOTE(review): the directives sit on a single line in this literal;
    // robots.txt normally has one directive per line - confirm the line breaks.
    string robotsContent = @"User-agent: * Disallow: ";
    string agent = _userAgentString;

    _unitUnderTest = new RobotsDotText(_rootUri, robotsContent);

    bool rootAllowed = _unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri, agent);
    Assert.IsTrue(rootAllowed);

    bool pageAllowed = _unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri + "aa.html", agent);
    Assert.IsTrue(pageAllowed);
}
// Expects the bare root url to stay allowed when the wildcard group disallows
// "/?/" and "/category/". Currently ignored; the reason is passed to the Ignore
// attribute so it is surfaced by the test runner rather than living only in a
// source comment.
[Test, Ignore("This is a bug and needs to be fixed")]
public void IsUrlAllowed_QuerystringOnRoot2_ReturnsTrue()
{
    string userAgentString = _userAgentString;

    // NOTE(review): the directives sit on a single line in this literal;
    // robots.txt normally has one directive per line - confirm the line breaks.
    _unitUnderTest = new RobotsDotText(_rootUri, @"User-Agent: * Disallow: /?/ Disallow: /category/");

    Assert.IsTrue(_unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri, userAgentString));
}
// With only query-string variants of the root disallowed, the bare root url
// itself is expected to be allowed.
public void IsUrlAllowed_QuerystringOnRoot_ReturnsTrue()
{
    // NOTE(review): the directives sit on a single line in this literal;
    // robots.txt normally has one directive per line - confirm the line breaks.
    string robotsContent = @"User-Agent: * Disallow: /?category=whatever Disallow: /?category=another&color=red";
    string agent = _userAgentString;

    _unitUnderTest = new RobotsDotText(_rootUri, robotsContent);

    bool rootAllowed = _unitUnderTest.IsUrlAllowed(_rootUri.AbsoluteUri, agent);
    Assert.IsTrue(rootAllowed);
}
// When the wildcard group disallows "/", an arbitrary user agent name is
// expected to be reported as not allowed.
public void IsUserAgentAllowed_WildCardUserAgent_ReturnsFalse()
{
    // NOTE(review): the directives sit on a single line in this literal;
    // robots.txt normally has one directive per line - confirm the line breaks.
    _unitUnderTest = new RobotsDotText(_rootUri, @" User-Agent: * Disallow: /");

    bool agentAllowed = _unitUnderTest.IsUserAgentAllowed("aaaaaaaaaaaa");

    Assert.IsFalse(agentAllowed);
}
// Asserts that a url whose query string exactly matches a Disallow entry is
// blocked, while a url that only shares the first query parameter is allowed.
public void IsUrlAllowed_QuerystringMatch_NotSupported_ReturnsFalse()
{
    // NOTE(review): the directives sit on a single line in this literal;
    // robots.txt normally has one directive per line - confirm the line breaks.
    string robotsContent = @"User-Agent: * Disallow: /?category=whatever Disallow: /?category=another&color=red";
    string agent = _userAgentString;

    _unitUnderTest = new RobotsDotText(_rootUri, robotsContent);

    string exactMatchUrl = _rootUri.AbsoluteUri + "?category=whatever";
    Assert.IsFalse(_unitUnderTest.IsUrlAllowed(exactMatchUrl, agent));

    string partialMatchUrl = _rootUri.AbsoluteUri + "?category=another&blah=blah";
    Assert.IsTrue(_unitUnderTest.IsUrlAllowed(partialMatchUrl, agent));
}
// Pins the current behaviour that query strings are not matched against
// Disallow entries: both probed urls are expected to be allowed.
//IF this test starts failing that is a good thing, it means the robots impl now supports querystrings
public void IsUrlAllowed_QuerystringMatch_NotSupported_ReturnsTrue()
{
    // NOTE(review): the directives sit on a single line in this literal;
    // robots.txt normally has one directive per line - confirm the line breaks.
    string robotsContent = @"User-Agent: * Disallow: /?category=whatever Disallow: /?category=another&color=red";
    string agent = _userAgentString;

    _unitUnderTest = new RobotsDotText(_rootUri, robotsContent);

    string exactMatchUrl = _rootUri.AbsoluteUri + "?category=whatever";
    Assert.IsTrue(_unitUnderTest.IsUrlAllowed(exactMatchUrl, agent));

    string partialMatchUrl = _rootUri.AbsoluteUri + "?category=another&blah=blah";
    Assert.IsTrue(_unitUnderTest.IsUrlAllowed(partialMatchUrl, agent));
}
/// <summary>
/// Parses the raw bytes of a robots.txt file into a <see cref="RobotsDotText"/>.
/// Only groups that apply to this crawler are honoured: groups addressed to "*"
/// and groups whose user-agent value contains the configured
/// <c>ApplicationSettings.UserAgent</c> (case-insensitive). From those groups the
/// <c>Disallow</c> paths (resolved against <paramref name="baseUri"/>) and the
/// integer <c>Crawl-delay</c> are collected.
/// </summary>
/// <param name="baseUri">The base URI that relative Disallow paths are resolved against.</param>
/// <param name="robotsDotTextSource">The robots.txt content; when null, an empty result is returned.</param>
/// <returns>
/// A <see cref="RobotsDotText"/> whose DisallowedPaths list is never null and holds each
/// disallowed absolute URI once; CrawlDelay is assigned only when a valid integer was found.
/// </returns>
public override RobotsDotText ParseRobotsDotTextSource(Uri baseUri, byte[] robotsDotTextSource)
{
    const string userAgentDirective = "user-agent:";
    const string crawlDelayDirective = "crawl-delay:";
    const string disallowDirective = "disallow:";

    RobotsDotText robotsDotText = new RobotsDotText();
    robotsDotText.DisallowedPaths = new List<string>();

    if (robotsDotTextSource == null)
    {
        return robotsDotText;
    }

    // Lower-cased once; a missing or empty configured agent can only match wildcard groups.
    string configuredUserAgent = ApplicationSettings.UserAgent;
    string ourUserAgent = string.IsNullOrEmpty(configuredUserAgent)
        ? string.Empty
        : configuredUserAgent.ToLowerInvariant();

    using (StreamReader streamReader = new StreamReader(new MemoryStream(robotsDotTextSource)))
    {
        // True while the directives being read belong to a group that applies to this crawler.
        bool groupApplies = false;

        // True when the previous directive was a user-agent line, so that consecutive
        // user-agent lines are treated as one group sharing the rules that follow.
        bool previousLineWasUserAgent = false;

        string originalLine;
        while ((originalLine = streamReader.ReadLine()) != null)
        {
            // '#' starts a comment that runs to the end of the line; drop it so it can
            // neither leak into a path nor break crawl-delay parsing.
            int commentStart = originalLine.IndexOf('#');
            string directiveLine = (commentStart >= 0
                ? originalLine.Substring(0, commentStart)
                : originalLine).Trim();

            if (directiveLine.Length == 0)
            {
                continue;
            }

            if (directiveLine.StartsWith(userAgentDirective, StringComparison.OrdinalIgnoreCase))
            {
                // Match against the directive's value only; matching the whole line would
                // let an agent such as "agent" match the "user-agent:" prefix itself.
                string agent = directiveLine.Substring(userAgentDirective.Length).Trim().ToLowerInvariant();
                bool matches = agent == "*"
                               || (ourUserAgent.Length > 0 && agent.Contains(ourUserAgent));

                groupApplies = matches || (previousLineWasUserAgent && groupApplies);
                previousLineWasUserAgent = true;
                continue;
            }

            previousLineWasUserAgent = false;

            if (!groupApplies)
            {
                continue;
            }

            if (directiveLine.StartsWith(crawlDelayDirective, StringComparison.OrdinalIgnoreCase))
            {
                int crawlDelay;
                if (int.TryParse(directiveLine.Substring(crawlDelayDirective.Length).Trim(), out crawlDelay))
                {
                    robotsDotText.CrawlDelay = crawlDelay;
                }

                continue;
            }

            if (directiveLine.StartsWith(disallowDirective, StringComparison.OrdinalIgnoreCase))
            {
                // Strip only the leading directive name and keep the path's original casing;
                // a later "disallow:" inside the path itself must survive.
                string path = directiveLine.Substring(disallowDirective.Length).Trim();

                // An empty Disallow value restricts nothing.
                if (path.Length == 0)
                {
                    continue;
                }

                Uri uri;
                if (Uri.TryCreate(baseUri, path, out uri)
                    && !robotsDotText.DisallowedPaths.Contains(uri.AbsoluteUri))
                {
                    robotsDotText.DisallowedPaths.Add(uri.AbsoluteUri);
                }
            }
        }
    }

    return robotsDotText;
}
// Constructs RobotsDotText with null robots content.
// NOTE(review): nothing is asserted in the body - presumably the expected
// exception is declared by an attribute on this method; confirm.
public void Constructor_NullContent()
{
    // The cast keeps the null typed as string, exactly like a typed local would.
    _unitUnderTest = new RobotsDotText(_rootUri, (string)null);
}
// Constructs RobotsDotText with a null root uri.
// NOTE(review): nothing is asserted in the body - presumably the expected
// exception is declared by an attribute on this method; confirm.
public void Constructor_NullRootUri()
{
    _unitUnderTest = new RobotsDotText(
        null,
        _robotsContent);
}
// Per-test fixture setup: rebuilds the unit under test from the shared root
// uri and robots content so each test starts from a fresh instance.
public void SetUp()
{
    _unitUnderTest = new RobotsDotText(
        _rootUri,
        _robotsContent);
}