public void TimerRemovalTest()
{
    // test if the timer correctly removes old robots.txt entries
    string robotsTxt = @"
User-agent: *
Disallow: /";

    var timerInterval = TimeSpan.FromSeconds(2);

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient(), timerInterval);
    var data = robots.RegisterRobotsTxt("test.com", robotsTxt).Result;

    // while the entry is cached, "Disallow: /" excludes everything on test.com
    var e = robots.IsUrlExcluded("http://test.com/something").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/lol/test").Result;
    Assert.True(e);

    // wait for the timer to fire and evict the cached entry
    Task.Delay((int)timerInterval.TotalMilliseconds + 100).Wait();

    // once the entry is gone, no URL is excluded anymore
    e = robots.IsUrlExcluded("http://test.com/something").Result;
    Assert.False(e);
    e = robots.IsUrlExcluded("http://test.com").Result;
    Assert.False(e);
    e = robots.IsUrlExcluded("http://test.com/lol/test").Result;
    Assert.False(e);
}
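// A minimal sketch of the cache-eviction behavior the test above relies on, assuming
// RobotsHandler keeps per-host robots.txt data in a dictionary and clears it on a
// timer. This is a hypothetical reconstruction, not CryCrawler's actual code; it
// clears the whole cache on each tick, while the real handler may evict only stale
// entries. Requires: using System; using System.Collections.Concurrent; using System.Threading;
class RobotsCacheSketch : IDisposable
{
    private readonly ConcurrentDictionary<string, string> _cache =
        new ConcurrentDictionary<string, string>();
    private readonly Timer _timer;

    public RobotsCacheSketch(TimeSpan interval)
        // drop all cached entries every time the interval elapses
        => _timer = new Timer(_ => _cache.Clear(), null, interval, interval);

    public void Register(string host, string robotsTxt) => _cache[host] = robotsTxt;

    // after the timer fires, cached hosts are gone and lookups miss,
    // which is why the test's IsUrlExcluded calls flip from True to False
    public bool TryGet(string host, out string robotsTxt) => _cache.TryGetValue(host, out robotsTxt);

    public void Dispose() => _timer.Dispose();
}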
public void GeneralTest2()
{
    // general test: only the group matching our user agent applies
    string robotsTxt = @"
User-agent: CryCrawler
Disallow: /

User-agent: *
Disallow: /admin
Disallow: /advertisements
Disallow: /artists
Disallow: /artist_commentaries
Disallow: /artist_commentary_versions
Disallow: /artist_versions
Disallow: /bans
Disallow: /comment_votes
Disallow: /comments
Disallow: /counts
Disallow: /delayed_jobs
Disallow: /dmails
Disallow: /favorite
Disallow: /iqdb_queries
Disallow: /ip_bans
Disallow: /janitor_trials";

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient());
    var data = robots.RegisterRobotsTxt("test.com", robotsTxt).Result;

    // only the CryCrawler group applies: a single "Disallow: /" and no Allow rules
    Assert.Empty(data.AllowedList);
    Assert.Single(data.DisallowedList);
    Assert.Equal(0, data.WaitTime);

    // none of the "*" group's rules may leak into our list
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/admin"), data.DisallowedList);
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/bans"), data.DisallowedList);
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/artist_commentary_versions"), data.DisallowedList);
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/favorite"), data.DisallowedList);
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/janitor_trials"), data.DisallowedList);

    // rules registered for test.com must not affect other hosts
    var e = robots.IsUrlExcluded("http://test2.com/admin").Result;
    Assert.False(e);

    // "Disallow: /" excludes every path on test.com
    e = robots.IsUrlExcluded("http://test.com/admin").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/admin?url=test").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/admin/test/admin?url=test").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/test/admin").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/admin2").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com").Result;
    Assert.True(e);
}
public void UserAgentMatchingTest()
{
    // test if user agents get matched correctly
    string robotsTxt = @"
User-agent: Googlebot
Allow: /testdomain2
Allow: /advertisements
Disallow: /artists

User-agent: CryCrawler
Disallow: /admin
Allow: /*?lang=
Disallow: /search/realtime

User-agent: *
Disallow: /testdomain
";

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient());
    var data = robots.RegisterRobotsTxt("test.com", robotsTxt).Result;

    // only the CryCrawler group counts: one Allow and two Disallow rules
    Assert.Single(data.AllowedList);
    Assert.Equal(2, data.DisallowedList.Count);
}
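// A rough sketch of the group selection the test above exercises: rules are grouped
// under "User-agent:" headers, and only the group matching our agent applies, falling
// back to "*" if no specific group exists. ParseForAgent is a hypothetical helper
// assuming simple line-based parsing, not CryCrawler's verified implementation.
// Requires: using System; using System.Collections.Generic;
static (List<string> allowed, List<string> disallowed) ParseForAgent(string robotsTxt, string userAgent)
{
    var allowed = new List<string>();
    var disallowed = new List<string>();
    bool groupApplies = false, matchedSpecific = false;

    foreach (var raw in robotsTxt.Split('\n'))
    {
        var line = raw.Trim();
        if (line.StartsWith("User-agent:", StringComparison.OrdinalIgnoreCase))
        {
            var agent = line.Substring("User-agent:".Length).Trim();
            bool specific = agent.Equals(userAgent, StringComparison.OrdinalIgnoreCase);

            // a specific group replaces anything collected from "*" before it,
            // and "*" only applies while no specific group has been seen
            if (specific && !matchedSpecific)
            {
                allowed.Clear();
                disallowed.Clear();
                matchedSpecific = true;
            }
            groupApplies = specific || (agent == "*" && !matchedSpecific);
        }
        else if (groupApplies && line.StartsWith("Allow:", StringComparison.OrdinalIgnoreCase))
            allowed.Add(line.Substring("Allow:".Length).Trim());
        else if (groupApplies && line.StartsWith("Disallow:", StringComparison.OrdinalIgnoreCase))
            disallowed.Add(line.Substring("Disallow:".Length).Trim());
    }
    return (allowed, disallowed);
}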
public void RegexMatchingTest()
{
    // test if regex matching works as expected
    string p1 = "Test.com", p2 = "*", p3 = "Cry*Crawler", p4 = "Cry*Craw*ler";

    var r1 = RobotsHandler.GetRegexPattern(p1);
    var r2 = RobotsHandler.GetRegexPattern(p2);
    var r3 = RobotsHandler.GetRegexPattern(p3);
    var r4 = RobotsHandler.GetRegexPattern(p4);

    var m = Regex.IsMatch("test.com", r1, RegexOptions.IgnoreCase);
    Assert.True(m);
    m = Regex.IsMatch("test.com", r1);
    Assert.False(m);
    m = Regex.IsMatch("Test.com", r1);
    Assert.True(m);

    m = Regex.IsMatch("test.com", r2, RegexOptions.IgnoreCase);
    Assert.True(m);
    m = Regex.IsMatch("test.com", r2);
    Assert.True(m);
    m = Regex.IsMatch("", r2);
    Assert.True(m);

    m = Regex.IsMatch("test.com", r3);
    Assert.False(m);
    m = Regex.IsMatch("CryCrawler", r3);
    Assert.True(m);
    m = Regex.IsMatch("ACryCrawler", r3);
    Assert.False(m);
    m = Regex.IsMatch("CryACrawler", r3);
    Assert.True(m);
    m = Regex.IsMatch("cryCrawler", r3);
    Assert.False(m);
    m = Regex.IsMatch("cryCrawler", r3, RegexOptions.IgnoreCase);
    Assert.True(m);
    m = Regex.IsMatch("CryCrawADler", r3);
    Assert.False(m);

    m = Regex.IsMatch("CryCrawADler", r4);
    Assert.True(m);
    m = Regex.IsMatch("CryCrawler", r4);
    Assert.True(m);
}
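// A plausible reconstruction of GetRegexPattern, inferred from the assertions above:
// the wildcard "*" becomes ".*", everything else is escaped literally, matching is
// case-sensitive unless RegexOptions.IgnoreCase is passed, and the pattern is anchored
// at the start (which is why "ACryCrawler" does not match "Cry*Crawler" but
// "CryACrawler" does). GetRegexPatternSketch is a hypothetical name; this is an
// assumption, not CryCrawler's verified implementation.
// Requires: using System.Text.RegularExpressions;
static string GetRegexPatternSketch(string robotsPattern)
    => "^" + Regex.Escape(robotsPattern).Replace(@"\*", ".*");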
private RobotsHandler GetRobotsHandler(string host)
{
    // reuse a cached handler if we already fetched this host's robots.txt
    RobotsHandler result;
    if (_robotsHandlers.TryGetValue(host, out result))
        return result;

    // otherwise download robots.txt synchronously and cache the new handler
    var uri = new Uri("http://" + host + "/robots.txt", UriKind.Absolute);
    var robotsTxt = _client.GetStringAsync(uri).Result;

    result = new RobotsHandler(robotsTxt);
    _robotsHandlers.Add(host, result);
    return result;
}
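// If GetRobotsHandler can be called from multiple crawler threads, the plain
// Dictionary above is a race (concurrent Add calls can corrupt it or throw).
// A sketch of the same cache using ConcurrentDictionary is shown below; this is a
// hypothetical variant, not the project's code, and _safeHandlers is an assumed field.
// Requires: using System; using System.Collections.Concurrent;
private readonly ConcurrentDictionary<string, RobotsHandler> _safeHandlers =
    new ConcurrentDictionary<string, RobotsHandler>();

private RobotsHandler GetRobotsHandlerSafe(string host)
    // GetOrAdd may invoke the factory more than once under contention,
    // but only one RobotsHandler ever ends up cached per host
    => _safeHandlers.GetOrAdd(host, h =>
        new RobotsHandler(_client.GetStringAsync(new Uri("http://" + h + "/robots.txt")).Result));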
public void PriorityMatchingTest()
{
    // test if more detailed rules have priority when matching patterns
    string robotsTxt = @"
User-agent: Googlebot
Allow: /testdomain2
Allow: /advertisements
Disallow: /artists

User-agent: CryCrawler
Disallow: /admin
Disallow: /search
Allow: /search/test

User-agent: *
Disallow: /testdomain
";

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient());
    var data = robots.RegisterRobotsTxt("test.com", robotsTxt).Result;

    Assert.Single(data.AllowedList);
    Assert.Equal(2, data.DisallowedList.Count);

    var e = robots.IsUrlExcluded("http://test.com/admin").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/search").Result;
    Assert.True(e);

    // "Allow: /search/test" is more detailed than "Disallow: /search", so it wins
    e = robots.IsUrlExcluded("http://test.com/search/test").Result;
    Assert.False(e);
    e = robots.IsUrlExcluded("http://test.com/search/test/anothertest").Result;
    Assert.False(e);
    e = robots.IsUrlExcluded("http://test.com/search/test?url=23").Result;
    Assert.False(e);

    // but it only covers the "/search/test" segment itself, not "/search/test2"
    e = robots.IsUrlExcluded("http://test.com/search/test2").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/search/nottest").Result;
    Assert.True(e);
}
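// A sketch of the "most detailed rule wins" resolution the test above describes:
// both lists are checked and the longest matching pattern decides. Matching is taken
// to stop at path-segment boundaries ("/search/test" covers "/search/test" and
// "/search/test/…" but not "/search/test2"), which is what the assertions on
// "/search/test2" and "/search/usersss" imply. This hypothetical helper operates on
// the raw robots.txt path patterns rather than the compiled regexes the handler uses.
// Requires: using System; using System.Collections.Generic; using System.Linq;
static bool IsExcludedSketch(string path, List<string> allowed, List<string> disallowed)
{
    // a pattern matches if the path equals it, or extends it past a segment boundary
    bool Matches(string pattern) =>
        path.Equals(pattern, StringComparison.Ordinal) ||
        (path.StartsWith(pattern, StringComparison.Ordinal) &&
         (pattern.EndsWith("/") ||
          path[pattern.Length] == '/' || path[pattern.Length] == '?'));

    int bestAllow = allowed.Where(Matches).Select(p => p.Length).DefaultIfEmpty(-1).Max();
    int bestDisallow = disallowed.Where(Matches).Select(p => p.Length).DefaultIfEmpty(-1).Max();

    // the longer (more detailed) rule has priority; an equally long Allow wins
    return bestDisallow > bestAllow;
}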
public void AdvancedTest()
{
    // general test
    string robotsTxt = @"
User-agent: Googlebot
Allow: /testdomain2
Allow: /advertisements
Disallow: /artists
Disallow: /artist_commentaries
Disallow: /artist_commentary_versions
Disallow: /artist_versions
Disallow: /bans
Disallow: /comment_votes
Disallow: /comments
Disallow: /counts
Disallow: /delayed_jobs
Disallow: /dmails
Disallow: /favorite
Disallow: /iqdb_queries
Disallow: /ip_bans
Disallow: /janitor_trials

User-agent: CryCrawler
Disallow: /admin
Allow: /*?lang=
Allow: /hashtag/*?src=
Allow: /search?q=%23
Disallow: /search/realtime
Disallow: /search/users
Disallow: /search/*/grid
Disallow: /hashtag
Crawl-delay: 1

User-agent: *
Disallow: /
Crawl-delay: 4
";

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient());
    var data = robots.RegisterRobotsTxt("test.com", robotsTxt).Result;

    // only the CryCrawler group applies, including its Crawl-delay of 1 second
    Assert.Equal(3, data.AllowedList.Count);
    Assert.Equal(5, data.DisallowedList.Count);
    Assert.Equal(1, data.WaitTime);

    var e = robots.IsUrlExcluded("http://test.com/testdomain2").Result;
    Assert.False(e);
    e = robots.IsUrlExcluded("http://test.com/admin").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/search").Result;
    Assert.False(e);
    e = robots.IsUrlExcluded("http://test.com/search/realtime").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/search/users").Result;
    Assert.True(e);

    // "/search/users" matches only up to a segment boundary, so "usersss" is allowed
    e = robots.IsUrlExcluded("http://test.com/search/usersss").Result;
    Assert.False(e);
    e = robots.IsUrlExcluded("http://test.com/search/something").Result;
    Assert.False(e);

    // the wildcard rule "/search/*/grid" applies, but not to "grid2"
    e = robots.IsUrlExcluded("http://test.com/search/something/grid").Result;
    Assert.True(e);
    e = robots.IsUrlExcluded("http://test.com/search/something/grid2").Result;
    Assert.False(e);
}
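// A small sketch of the Crawl-delay handling the test above asserts on: the value
// from the group matching our user agent ("Crawl-delay: 1") ends up in data.WaitTime,
// while the fallback group's "Crawl-delay: 4" is ignored. Parsing one directive line
// might look like this; ParseCrawlDelay is a hypothetical helper, and the numeric
// type of WaitTime is an assumption based on the test's Assert.Equal calls.
// Requires: using System.Globalization;
static double ParseCrawlDelay(string line)
{
    // "Crawl-delay: 1" -> 1; malformed values fall back to 0 (no delay)
    var value = line.Substring("Crawl-delay:".Length).Trim();
    return double.TryParse(value, NumberStyles.Float, CultureInfo.InvariantCulture, out var seconds)
        ? seconds
        : 0;
}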
public void Init()
{
    _handler = new RobotsHandler(Sample);
}