// Verifies that the cleanup timer removes stale robots.txt entries:
// exclusion rules apply before the timer interval elapses and stop
// applying once the entry has been evicted.
public async Task TimerRemovalTest()
{
    // "Disallow: /" for all user agents — every URL on the host is excluded.
    string robotsTxt = @"
User-agent: *
Disallow: /";

    var timerInterval = TimeSpan.FromSeconds(2);

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient(), timerInterval);

    // Parsed data itself is not inspected here; only the expiry behavior is.
    _ = await robots.RegisterRobotsTxt("test.com", robotsTxt);

    // While the entry is alive, everything on the host is excluded.
    Assert.True(await robots.IsUrlExcluded("http://test.com/something"));
    Assert.True(await robots.IsUrlExcluded("http://test.com"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/lol/test"));

    // Wait slightly past the timer interval so the entry gets removed.
    // NOTE(review): real-time delay makes this test timing-sensitive on a
    // loaded machine — consider injecting a TimeProvider into RobotsHandler.
    await Task.Delay(timerInterval + TimeSpan.FromMilliseconds(100));

    // Entry evicted — nothing is excluded any more.
    Assert.False(await robots.IsUrlExcluded("http://test.com/something"));
    Assert.False(await robots.IsUrlExcluded("http://test.com"));
    Assert.False(await robots.IsUrlExcluded("http://test.com/lol/test"));
}
// Verifies that only the group matching our user agent is used: the
// CryCrawler group contributes its single "Disallow: /" rule, and every
// rule in the "*" group is ignored.
public async Task GeneralTest2()
{
    string robotsTxt = @"
User-agent: CryCrawler
Disallow: /
User-agent: *
Disallow: /admin
Disallow: /advertisements
Disallow: /artists
Disallow: /artist_commentaries
Disallow: /artist_commentary_versions
Disallow: /artist_versions
Disallow: /bans
Disallow: /comment_votes
Disallow: /comments
Disallow: /counts
Disallow: /delayed_jobs
Disallow: /dmails
Disallow: /favorite
Disallow: /iqdb_queries
Disallow: /ip_bans
Disallow: /janitor_trials";

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient());
    var data = await robots.RegisterRobotsTxt("test.com", robotsTxt);

    // Only the CryCrawler group's single "Disallow: /" rule is kept.
    Assert.Empty(data.AllowedList);
    Assert.Single(data.DisallowedList);
    Assert.Equal(0, data.WaitTime);

    // None of the "*" group's rules may leak into our rule set.
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/admin"), data.DisallowedList);
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/bans"), data.DisallowedList);
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/artist_commentary_versions"), data.DisallowedList);
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/favorite"), data.DisallowedList);
    Assert.DoesNotContain(RobotsHandler.GetRegexPattern("/janitor_trials"), data.DisallowedList);

    // A host with no registered robots.txt is never excluded.
    Assert.False(await robots.IsUrlExcluded("http://test2.com/admin"));

    // "Disallow: /" excludes every path on the registered host.
    Assert.True(await robots.IsUrlExcluded("http://test.com/admin"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/admin?url=test"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/admin/test/admin?url=test"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/test/admin"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/admin2"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/"));
    Assert.True(await robots.IsUrlExcluded("http://test.com"));
}
// Verifies that rule groups are matched to user agents correctly: with
// UserAgent = "CryCrawler" only the CryCrawler group's rules (1 allow,
// 2 disallow) are picked up; the Googlebot and "*" groups are ignored.
public async Task UserAgentMatchingTest()
{
    string robotsTxt = @"
User-agent: Googlebot
Allow: /testdomain2
Allow: /advertisements
Disallow: /artists
User-agent: CryCrawler
Disallow: /admin
Allow: /*?lang=
Disallow: /search/realtime
User-agent: *
Disallow: /testdomain
";

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient());
    var data = await robots.RegisterRobotsTxt("test.com", robotsTxt);

    // Exactly the CryCrawler group's rules: one Allow, two Disallow.
    Assert.Single(data.AllowedList);
    Assert.Equal(2, data.DisallowedList.Count);
}
// Verifies that a more specific Allow rule ("/search/test") overrides a
// broader Disallow rule ("/search") when both match a URL.
public async Task PriorityMatchingTest()
{
    string robotsTxt = @"
User-agent: Googlebot
Allow: /testdomain2
Allow: /advertisements
Disallow: /artists
User-agent: CryCrawler
Disallow: /admin
Disallow: /search
Allow: /search/test
User-agent: *
Disallow: /testdomain
";

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient());
    var data = await robots.RegisterRobotsTxt("test.com", robotsTxt);

    // Only the CryCrawler group applies: one Allow, two Disallow.
    Assert.Single(data.AllowedList);
    Assert.Equal(2, data.DisallowedList.Count);

    // Plain disallowed prefixes.
    Assert.True(await robots.IsUrlExcluded("http://test.com/admin"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/search"));

    // "Allow: /search/test" wins over "Disallow: /search" for matching URLs.
    Assert.False(await robots.IsUrlExcluded("http://test.com/search/test"));
    Assert.False(await robots.IsUrlExcluded("http://test.com/search/test/anothertest"));
    Assert.False(await robots.IsUrlExcluded("http://test.com/search/test?url=23"));

    // URLs under /search that do NOT match the allow pattern stay excluded.
    Assert.True(await robots.IsUrlExcluded("http://test.com/search/test2"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/search/nottest"));
}
// End-to-end test over a realistic robots.txt: wildcard patterns ("*"),
// query-string rules, Crawl-delay, and allow/disallow interplay for the
// CryCrawler group only (Googlebot and "*" groups must be ignored).
public async Task AdvancedTest()
{
    string robotsTxt = @"
User-agent: Googlebot
Allow: /testdomain2
Allow: /advertisements
Disallow: /artists
Disallow: /artist_commentaries
Disallow: /artist_commentary_versions
Disallow: /artist_versions
Disallow: /bans
Disallow: /comment_votes
Disallow: /comments
Disallow: /counts
Disallow: /delayed_jobs
Disallow: /dmails
Disallow: /favorite
Disallow: /iqdb_queries
Disallow: /ip_bans
Disallow: /janitor_trials
User-agent: CryCrawler
Disallow: /admin
Allow: /*?lang=
Allow: /hashtag/*?src=
Allow: /search?q=%23
Disallow: /search/realtime
Disallow: /search/users
Disallow: /search/*/grid
Disallow: /hashtag
Crawl-delay: 1
User-agent: *
Disallow: /
Crawl-delay: 4
";

    var config = new WorkerConfiguration()
    {
        UserAgent = "CryCrawler",
        RespectRobotsExclusionStandard = true
    };

    var robots = new RobotsHandler(config, new System.Net.Http.HttpClient());
    var data = await robots.RegisterRobotsTxt("test.com", robotsTxt);

    // Only the CryCrawler group's rules and crawl delay are picked up
    // (3 Allow, 5 Disallow, Crawl-delay 1 — not the "*" group's 4).
    Assert.Equal(3, data.AllowedList.Count);
    Assert.Equal(5, data.DisallowedList.Count);
    Assert.Equal(1, data.WaitTime);

    // Googlebot-only rule must not apply to us.
    Assert.False(await robots.IsUrlExcluded("http://test.com/testdomain2"));

    Assert.True(await robots.IsUrlExcluded("http://test.com/admin"));

    // "/search" itself is not disallowed — only specific sub-paths are.
    Assert.False(await robots.IsUrlExcluded("http://test.com/search"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/search/realtime"));
    Assert.True(await robots.IsUrlExcluded("http://test.com/search/users"));

    // Exact sub-path matching: "usersss" is not "users".
    Assert.False(await robots.IsUrlExcluded("http://test.com/search/usersss"));
    Assert.False(await robots.IsUrlExcluded("http://test.com/search/something"));

    // Wildcard rule "/search/*/grid" matches any middle segment exactly.
    Assert.True(await robots.IsUrlExcluded("http://test.com/search/something/grid"));
    Assert.False(await robots.IsUrlExcluded("http://test.com/search/something/grid2"));
}