public void IsPathAllowedStarWildcard(string rule, string path, bool result)
{
    string s = @"User-agent: *" + this.newLine + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    Assert.AreEqual(result, r.IsPathAllowed("*", path));
}
public void IsPathAllowed_StarWildcard(string rule, string path, bool result)
{
    string s = @"User-agent: *" + nl + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    Assert.Equal(result, r.IsPathAllowed("*", path));
}
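The tests above exercise Robots.Load and IsPathAllowed only against inline strings. Below is a minimal sketch of the same calls against a live site, assuming nothing beyond the Robots.Load(string) and IsPathAllowed(userAgent, path) members shown in these examples; the HttpClient usage and the "missing robots.txt means allow" fallback are illustrative choices, not part of the library.

// Sketch only: fetch robots.txt for a page's host and query it with the same API
// used in the tests above. Requires: using System; using System.Net.Http; using System.Threading.Tasks;
public static async Task<bool> MayCrawlAsync(string pageUrl, string userAgent)
{
    var uri = new Uri(pageUrl);
    var robotsUrl = uri.GetLeftPart(UriPartial.Authority) + "/robots.txt";

    using (var http = new HttpClient())
    {
        string content;
        try
        {
            content = await http.GetStringAsync(robotsUrl);
        }
        catch (HttpRequestException)
        {
            return true; // no reachable robots.txt: treat every path as allowed (assumption)
        }

        Robots robots = Robots.Load(content);
        return robots.IsPathAllowed(userAgent, uri.PathAndQuery);
    }
}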
/// <summary>
/// Crawls a page.
/// </summary>
/// <param name="url">The url to crawl.</param>
private void CrawlPage(string url)
{
    // clean up the url a bit
    url = StandardizeUrl(url);

    try
    {
        if (!PageHasBeenCrawled(url) && _robotHelper.IsPathAllowed(_userAgent, url) && url.StartsWith(_baseUrl))
        {
            string rawPage = GetWebText(url);

            if (!string.IsNullOrWhiteSpace(rawPage))
            {
                var htmlDoc = new HtmlDocument();
                htmlDoc.LoadHtml(rawPage);

                // ensure the page should be indexed by looking at the robot and rock conventions
                HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='robot']");
                if (metaRobot == null || metaRobot.Attributes["content"] == null || !metaRobot.Attributes["content"].Value.Contains("noindex"))
                {
                    _previouslyCrawledPages.Add(url);

                    // index the page
                    SitePageIndex sitePage = new SitePageIndex();
                    sitePage.Content = GetPageText(htmlDoc);
                    sitePage.Url = url;
                    sitePage.Id = url.MakeInt64HashCode();
                    sitePage.SourceIndexModel = "Rock.Model.Site";
                    sitePage.PageTitle = GetPageTitle(htmlDoc, url);
                    sitePage.DocumentName = sitePage.PageTitle;
                    sitePage.SiteName = _site.Name;
                    sitePage.SiteId = _site.Id;
                    sitePage.LastIndexedDateTime = RockDateTime.Now;

                    HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                    if (metaDescription != null && metaDescription.Attributes["content"] != null)
                    {
                        sitePage.PageSummary = metaDescription.Attributes["content"].Value;
                    }

                    HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='keywords']");
                    if (metaKeywords != null && metaKeywords.Attributes["content"] != null)
                    {
                        sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
                    }

                    IndexContainer.IndexDocument(sitePage);

                    // crawl all the links found on the page.
                    foreach (string link in ParseLinks(htmlDoc))
                    {
                        CrawlPage(link);
                    }
                }
            }
        }
    }
    catch
    {
        // swallow errors so a single bad page doesn't stop the whole crawl
    }
}
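CrawlPage assumes _robotHelper has already been loaded from the target site's robots.txt, but that setup is not part of the snippet. The following is a minimal sketch of one way it could be initialized, assuming _robotHelper is the same Robots type used elsewhere on this page; the WebClient download, the /robots.txt path handling, and the empty-string fallback are illustrative only.

// Hypothetical setup for the fields CrawlPage relies on; not taken from the original source.
// Assumes only Robots.Load(string), as used throughout these examples.
private void InitializeRobotHelper(string baseUrl)
{
    _baseUrl = baseUrl;
    string robotsTxt = string.Empty;

    try
    {
        using (var client = new System.Net.WebClient())
        {
            robotsTxt = client.DownloadString(baseUrl.TrimEnd('/') + "/robots.txt");
        }
    }
    catch (System.Net.WebException)
    {
        // no robots.txt available: an empty rule set allows every path
    }

    _robotHelper = Robots.Load(robotsTxt);
}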
public bool IsPathAllowed_StarWildcard(string rule, string path)
{
    string s = @"User-agent: *" + nl + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    return r.IsPathAllowed("*", path);
}

public void IsPathAllowedRuleWithoutUserAgentTrue()
{
    string s = "Disallow: /";
    Robots r = Robots.Load(s);
    Assert.True(r.IsPathAllowed("*", "/foo"));
}

public void IsPathAllowed_DollarWildcard_False(string path)
{
    string s = @"User-agent: *" + nl + "Disallow: /*.gif$";
    Robots r = Robots.Load(s);
    Assert.False(r.IsPathAllowed("*", path));
}

public void IsPathAllowed_AllowAndDisallow_False(string path)
{
    string s = @"User-agent: *" + nl + "Allow: /dir/file.ext" + nl + "Disallow: /dir/";
    Robots r = new Robots(s);
    Assert.False(r.IsPathAllowed("*", path));
}

public void IsPathAllowed_OnlyDisallow_False(string path)
{
    string s = @"User-agent: *" + nl + "Disallow: /help";
    Robots r = new Robots(s);
    Assert.False(r.IsPathAllowed("*", path));
}

public void IsPathAllowed_WithoutRules_True(string userAgent, string path)
{
    Robots r = new Robots(String.Empty);
    Assert.True(r.IsPathAllowed(userAgent, path));
}

public void IsPathAllowed_DollarWildcard_True(
    [Values("asd", "a.gifa", "a.gif$")] string path)
{
    string s = @"User-agent: *" + nl + "Disallow: /*.gif$";
    Robots r = Robots.Load(s);
    Assert.True(r.IsPathAllowed("*", path));
}
public void Robots_InvalidUserAgent_Malformed()
{
    string s = "User-agent: " + nl + "Disallow: /file.html";
    Robots r = new Robots(s);
    Assert.True(r.Malformed);
    Assert.True(r.IsPathAllowed("myRobot", "/file.html"));
}
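The test above shows that bad input sets a Malformed flag instead of throwing, and that the rules that did parse are still usable. A minimal sketch of how a caller might act on that flag follows, assuming only the Malformed property and IsPathAllowed member seen in these examples; the logging is a placeholder.

// Sketch: surface a malformed robots.txt without refusing to crawl.
// Assumes only Robots.Malformed and Robots.IsPathAllowed, as shown above.
public static bool MayFetch(Robots robots, string userAgent, string path)
{
    if (robots.Malformed)
    {
        // The rules that did parse are still applied, but flag the file for review.
        System.Console.WriteLine("robots.txt parsed with errors; decision for " + path + " may be unreliable.");
    }

    return robots.IsPathAllowed(userAgent, path);
}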
public void IsPathAllowed_NoRulesForRobot_True(string path)
{
    string s = "User-agent: Slurp" + nl + "Disallow: /";
    Robots r = new Robots(s);
    Assert.True(r.IsPathAllowed("some robot", path));
}

public void IsPathAllowedNoRulesForRobotTrue(
    [Values("", "/", "/file.html", "/directory/")] string path)
{
    string s = "User-agent: Slurp" + this.newLine + "Disallow: /";
    Robots r = new Robots(s);
    Assert.True(r.IsPathAllowed("some robot", path));
}

public void IsPathAllowedWithoutRulesTrue(
    [Values("*", "some robot")] string userAgent,
    [Values("", "/", "/file.html", "/directory/")] string path)
{
    Robots r = new Robots(String.Empty);
    Assert.True(r.IsPathAllowed(userAgent, path));
}

public void IsPathAllowed_OnlyDisallow_False(
    [Values("/help", "/help.ext", "/help/", "/help/file.ext", "/help/dir/", "/help/dir/file.ext")] string path)
{
    string s = @"User-agent: *" + this.newLine + "Disallow: /help";
    Robots r = new Robots(s);
    Assert.False(r.IsPathAllowed("*", path));
}

public void IsPathAllowed_AllowAndDisallow_True(
    [Values("foo", "/dir/file.ext", "/dir/file.ext1")] string path)
{
    string s = @"User-agent: *" + nl + "Allow: /dir/file.ext" + nl + "Disallow: /dir/";
    Robots r = new Robots(s);
    Assert.True(r.IsPathAllowed("*", path));
}
public void IsPathAllowedAllowAndDisallowFalse(
    [Values("/dir/file2.ext", "/dir/", "/dir/dir/")] string path)
{
    string s = @"User-agent: *" + this.newLine + "Allow: /dir/file.ext" + this.newLine + "Disallow: /dir/";
    Robots r = new Robots(s);
    Assert.False(r.IsPathAllowed("*", path));
}

public void IsPathAllowedDollarWildcardFalse(
    [Values("a.gif", "foo.gif", "b.a.gif", "a.gif.gif")] string path)
{
    string s = @"User-agent: *" + this.newLine + "Disallow: /*.gif$";
    Robots r = Robots.Load(s);
    Assert.False(r.IsPathAllowed("*", path));
}

public void IsPathAllowedWithoutAccessRuleTrue(
    [Values("*", "some robot")] string userAgent,
    [Values("", "/", "/file.html", "/directory/")] string path)
{
    string s = "User-agent: *" + this.newLine + "Crawl-delay: 5";
    Robots r = new Robots(s);
    Assert.True(r.IsPathAllowed(userAgent, path));
}

public void IsPathAllowed_NoGlobalRules_False(string userAgent, string path)
{
    string s = "User-agent: Slurp" + nl + "Disallow: /" + nl + "User-agent: Exabot" + nl + "Disallow: /";
    Robots r = new Robots(s);
    Assert.False(r.IsPathAllowed(userAgent, path));
}

public void IsPathAllowedPathShouldBeCaseSensitiveTrue(
    [Values("/dir/file.ext", "/dir/file.ext", "/*/file.html", "/*.gif$")] string rule,
    [Values("/dir/File.ext", "/Dir/file.ext", "/a/File.html", "a.GIF")] string path)
{
    string s = @"User-agent: *" + this.newLine + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    Assert.True(r.IsPathAllowed("*", path));
}
public void IsPathAllowedEmptyUserAgentThrowsArgumentException(
    [Values("", " ")] string userAgent, // white space is considered empty
    [Values("")] string path)
{
    string s = "User-agent: *" + this.newLine + "Disallow: /";
    Robots r = new Robots(s);
    Assert.Throws<ArgumentException>(() => r.IsPathAllowed(userAgent, path));
}

public void IsPathAllowed_EmptyUserAgent_ThrowsArgumentException(
    string userAgent, // white space is considered empty
    string path)
{
    string s = "User-agent: *" + nl + "Disallow: /";
    Robots r = new Robots(s);
    Assert.Throws<ArgumentException>(() => r.IsPathAllowed(userAgent, path));
}

public void IsPathAllowed_PathShouldBeCaseSensitive_True(string rule, string path)
{
    string s = @"User-agent: *" + nl + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    Assert.True(r.IsPathAllowed("*", path));
}
// Can the site be indexed (is this path allowed by robots.txt)?
public bool robotsTxt(string url)
{
    //String urlBase = UrlClass.getBaseUrl(url);
    var uri = new Uri(url);
    bool canIGoThere = robots.IsPathAllowed("*", uri.PathAndQuery);
    return canIGoThere;
}
public void IsPathAllowedNoGlobalRulesFalse(
    [Values("Slurp", "slurp", "Exabot", "exabot")] string userAgent,
    [Values("", "/", "/file.html", "/directory/")] string path)
{
    string s = "User-agent: Slurp" + this.newLine + "Disallow: /" + this.newLine + "User-agent: Exabot" + this.newLine + "Disallow: /";
    Robots r = new Robots(s);
    Assert.False(r.IsPathAllowed(userAgent, path));
}

public void IsPathAllowed_WithoutAccessRule_True(string userAgent, string path)
{
    string s = "User-agent: *" + nl + "Crawl-delay: 5";
    Robots r = new Robots(s);
    Assert.True(r.IsPathAllowed(userAgent, path));
}

public void IsPathAllowed_UserAgentStringCaseInsensitive_False(string userAgent)
{
    string s = @"User-agent: Slurp
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: figtree
Disallow: /";
    Robots r = Robots.Load(s);
    Assert.False(r.IsPathAllowed(userAgent, "/dir"));
}

public void IsPathAllowedUserAgentStringCaseInsensitiveFalse(
    [Values("Slurp", "slurp", "Exabot", "exabot", "FigTree/0.1 Robot libwww-perl/5.04")] string userAgent)
{
    string s = @"User-agent: Slurp
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: figtree
Disallow: /";
    Robots r = Robots.Load(s);
    Assert.False(r.IsPathAllowed(userAgent, "/dir"));
}
public void ProcessLinks(IEnumerable<string> links)
{
    foreach (var link in links.Where(l => !string.IsNullOrEmpty(l)).Distinct())
    {
        var clearedLink = ClearLink(link);

        if (!_robots.IsPathAllowed(Crawler.USER_AGENT, clearedLink))
        {
            continue;
        }

        if (UrlHelper.GetDomain(UrlHelper.CreatUri(clearedLink)) == _masterDomain &&
            !ProcessedList.Contains(clearedLink) &&
            !ToProcessList.Contains(clearedLink))
        {
            ToProcessList.Push(clearedLink);
        }
    }
}
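ProcessLinks only filters links and pushes them onto ToProcessList; the loop that drains that list is not shown. Below is a minimal sketch of such a driver, assuming ToProcessList is a Stack<string>, ProcessedList is a HashSet<string>, and DownloadPage and ExtractLinks are hypothetical placeholders for the crawler's real fetch and parse steps.

// Hypothetical driver around ProcessLinks; only ProcessLinks, ToProcessList and
// ProcessedList come from the snippet above, everything else is a placeholder.
public void Crawl(string startUrl)
{
    ToProcessList.Push(startUrl);

    while (ToProcessList.Count > 0)
    {
        var url = ToProcessList.Pop();
        if (!ProcessedList.Add(url))
        {
            continue; // already visited
        }

        string html = DownloadPage(url);                 // placeholder fetch step
        IEnumerable<string> links = ExtractLinks(html);  // placeholder parse step

        ProcessLinks(links); // robots.txt and same-domain filtering happen here
    }
}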