public void IsPathAllowedStarWildcard(string rule, string path, bool result)
        {
            string s = @"User-agent: *" + this.newLine + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.AreEqual(result, r.IsPathAllowed("*", path));
        }
Example #2
        public void IsPathAllowed_StarWildcard(string rule, string path, bool result)
        {
            string s = @"User-agent: *" + nl + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.Equal(result, r.IsPathAllowed("*", path));
        }
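These two variants are the same parameterized test from different suites (NUnit's Assert.AreEqual vs. xUnit's Assert.Equal). A minimal sketch of how the star-wildcard cases might be driven in xUnit; the rule/path pairs here are illustrative assumptions, not the original test data:

        [Theory]
        [InlineData("/*.gif", "/pics/a.gif", false)] // '*' matches any run of characters, so the rule applies
        [InlineData("/*.gif", "/pics/a.jpg", true)]  // rule does not match, so the path stays allowed
        public void StarWildcardSketch(string rule, string path, bool expected)
        {
            string s = "User-agent: *\nDisallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.Equal(expected, r.IsPathAllowed("*", path));
        }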
Example #3
        /// <summary>
        /// Crawls a page.
        /// </summary>
        /// <param name="url">The url to crawl.</param>
        private void CrawlPage(string url)
        {
            // clean up the url a bit
            url = StandardizeUrl(url);

            try
            {
                if (!PageHasBeenCrawled(url) && _robotHelper.IsPathAllowed(_userAgent, url) && url.StartsWith(_baseUrl))
                {
                    string rawPage = GetWebText(url);

                    if (!string.IsNullOrWhiteSpace(rawPage))
                    {
                        var htmlDoc = new HtmlDocument();
                        htmlDoc.LoadHtml(rawPage);

                        // ensure the page should be indexed by checking the robots meta tag and Rock conventions
                        HtmlNode metaRobot = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='robot']");
                        if (metaRobot == null || metaRobot.Attributes["content"] == null || !metaRobot.Attributes["content"].Value.Contains("noindex"))
                        {
                            _previouslyCrawledPages.Add(url);

                            // index the page
                            SitePageIndex sitePage = new SitePageIndex();

                            sitePage.Content             = GetPageText(htmlDoc);
                            sitePage.Url                 = url;
                            sitePage.Id                  = url.MakeInt64HashCode();
                            sitePage.SourceIndexModel    = "Rock.Model.Site";
                            sitePage.PageTitle           = GetPageTitle(htmlDoc, url);
                            sitePage.DocumentName        = sitePage.PageTitle;
                            sitePage.SiteName            = _site.Name;
                            sitePage.SiteId              = _site.Id;
                            sitePage.LastIndexedDateTime = RockDateTime.Now;

                            HtmlNode metaDescription = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='description']");
                            if (metaDescription != null && metaDescription.Attributes["content"] != null)
                            {
                                sitePage.PageSummary = metaDescription.Attributes["content"].Value;
                            }

                            HtmlNode metaKeywords = htmlDoc.DocumentNode.SelectSingleNode("//meta[@name='keywords']");
                            if (metaKeywords != null && metaKeywords.Attributes["content"] != null)
                            {
                                sitePage.PageKeywords = metaKeywords.Attributes["content"].Value;
                            }

                            IndexContainer.IndexDocument(sitePage);

                            // crawl all the links found on the page.
                            foreach (string link in ParseLinks(htmlDoc))
                            {
                                CrawlPage(link);
                            }
                        }
                    }
                }
            }
            catch { } // swallow per-page errors so a single bad page does not abort the crawl
        }
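CrawlPage assumes _robotHelper is already populated. A minimal sketch of how it might be initialized from the site's robots.txt using the same Robots class; the method name, the WebClient fetch, and the fallback behavior are assumptions, not part of the original crawler:

        // Hypothetical initializer for _robotHelper; fetches the site's robots.txt once.
        private Robots LoadRobotsForSite(string baseUrl)
        {
            using (var client = new System.Net.WebClient())
            {
                try
                {
                    string robotsTxt = client.DownloadString(baseUrl.TrimEnd('/') + "/robots.txt");
                    return Robots.Load(robotsTxt);
                }
                catch (System.Net.WebException)
                {
                    // No robots.txt (or fetch failed): fall back to "everything allowed".
                    return Robots.Load(string.Empty);
                }
            }
        }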
Example #4
        public bool IsPathAllowed_StarWildcard(string rule, string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            return r.IsPathAllowed("*", path);
        }
Example #5
        public void IsPathAllowedRuleWithoutUserAgentTrue()
        {
            string s = "Disallow: /";
            Robots r = Robots.Load(s);

            Assert.True(r.IsPathAllowed("*", "/foo"));
        }
Example #6
        public void IsPathAllowed_DollarWildcard_False(
            string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: /*.gif$";
            Robots r = Robots.Load(s);

            Assert.False(r.IsPathAllowed("*", path));
        }
Example #7
        public void IsPathAllowed_AllowAndDisallow_False(
            string path)
        {
            string s = @"User-agent: *" + nl + "Allow: /dir/file.ext" + nl + "Disallow: /dir/";
            Robots r = new Robots(s);

            Assert.False(r.IsPathAllowed("*", path));
        }
Example #8
        public void IsPathAllowed_OnlyDisallow_False(
            string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: /help";
            Robots r = new Robots(s);

            Assert.False(r.IsPathAllowed("*", path));
        }
Example #9
        public void IsPathAllowed_WithoutRules_True(
            string userAgent,
            string path)
        {
            Robots r = new Robots(String.Empty);

            Assert.True(r.IsPathAllowed(userAgent, path));
        }
Example #10
        public void IsPathAllowed_DollarWildcard_True(
            [Values("asd", "a.gifa", "a.gif$")] string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: /*.gif$";
            Robots r = Robots.Load(s);

            Assert.True(r.IsPathAllowed("*", path));
        }
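Taken together with Example #6, these cases pin down the '$' semantics: it anchors the rule at the end of the path, while a '$' inside a path is just a literal character (which is why "a.gif$" stays allowed). A quick illustrative check; the values are assumptions, not from the original suites:

            Robots r = Robots.Load("User-agent: *\nDisallow: /*.gif$");

            r.IsPathAllowed("*", "/a.gif");     // false: the path ends in .gif
            r.IsPathAllowed("*", "/a.gif.jpg"); // true: .gif is not at the end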
Example #11
        public void Robots_InvalidUserAgent_Malformed()
        {
            string s = "User-agent: " + nl + "Disallow: /file.html";
            Robots r = new Robots(s);

            Assert.True(r.Malformed);
            Assert.True(r.IsPathAllowed("myRobot", "/file.html"));
        }
Example #12
        public void IsPathAllowed_NoRulesForRobot_True(
            string path)
        {
            string s = "User-agent: Slurp" + nl + "Disallow: /";
            Robots r = new Robots(s);

            Assert.True(r.IsPathAllowed("some robot", path));
        }
Example #13
        public void IsPathAllowedNoRulesForRobotTrue(
            [Values("", "/", "/file.html", "/directory/")] string path)
        {
            string s = "User-agent: Slurp" + this.newLine + "Disallow: /";
            Robots r = new Robots(s);

            Assert.True(r.IsPathAllowed("some robot", path));
        }
Example #14
        public void IsPathAllowedWithoutRulesTrue(
            [Values("*", "some robot")] string userAgent,
            [Values("", "/", "/file.html", "/directory/")] string path)
        {
            Robots r = new Robots(String.Empty);

            Assert.True(r.IsPathAllowed(userAgent, path));
        }
Example #15
        public void IsPathAllowed_OnlyDisallow_False(
            [Values("/help", "/help.ext", "/help/", "/help/file.ext", "/help/dir/", "/help/dir/file.ext")] string path)
        {
            string s = @"User-agent: *" + this.newLine + "Disallow: /help";
            Robots r = new Robots(s);

            Assert.False(r.IsPathAllowed("*", path));
        }
Example #16
        public void IsPathAllowed_AllowAndDisallow_True(
            [Values("foo", "/dir/file.ext", "/dir/file.ext1")] string path)
        {
            string s = @"User-agent: *" + nl + "Allow: /dir/file.ext" + nl + "Disallow: /dir/";
            Robots r = new Robots(s);

            Assert.True(r.IsPathAllowed("*", path));
        }
Example #17
        public void IsPathAllowedAllowAndDisallowFalse(
            [Values("/dir/file2.ext", "/dir/", "/dir/dir/")] string path)
        {
            string s = @"User-agent: *" + this.newLine + "Allow: /dir/file.ext" + this.newLine + "Disallow: /dir/";
            Robots r = new Robots(s);

            Assert.False(r.IsPathAllowed("*", path));
        }
Example #18
        public void IsPathAllowedDollarWildcardFalse(
            [Values("a.gif", "foo.gif", "b.a.gif", "a.gif.gif")] string path)
        {
            string s = @"User-agent: *" + this.newLine + "Disallow: /*.gif$";
            Robots r = Robots.Load(s);

            Assert.False(r.IsPathAllowed("*", path));
        }
Example #19
        public void IsPathAllowedWithoutAccessRuleTrue(
            [Values("*", "some robot")] string userAgent,
            [Values("", "/", "/file.html", "/directory/")] string path)
        {
            string s = "User-agent: *" + this.newLine + "Crawl-delay: 5";
            Robots r = new Robots(s);

            Assert.True(r.IsPathAllowed(userAgent, path));
        }
Example #20
        public void IsPathAllowed_NoGlobalRules_False(
            string userAgent,
            string path)
        {
            string s = "User-agent: Slurp" + nl + "Disallow: /" + nl + "User-agent: Exabot" + nl + "Disallow: /";
            Robots r = new Robots(s);

            Assert.False(r.IsPathAllowed(userAgent, path));
        }
Example #21
        public void IsPathAllowedPathShouldBeCaseSensitiveTrue(
            [Values("/dir/file.ext", "/dir/file.ext", "/*/file.html", "/*.gif$")] string rule,
            [Values("/dir/File.ext", "/Dir/file.ext", "/a/File.html", "a.GIF")] string path)
        {
            string s = @"User-agent: *" + this.newLine + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.True(r.IsPathAllowed("*", path));
        }
Example #22
        public void IsPathAllowedEmptyUserAgentThrowsArgumentException(
            [Values("", " ")] string userAgent, // white space considered empty
            [Values("")] string path)
        {
            string s = "User-agent: *" + this.newLine + "Disallow: /";
            Robots r = new Robots(s);

            Assert.Throws<ArgumentException>(() => r.IsPathAllowed(userAgent, path));
        }
Example #23
        public void IsPathAllowed_EmptyUserAgent_ThrowsArgumentException(
            string userAgent, // white space considered empty
            string path)
        {
            string s = "User-agent: *" + nl + "Disallow: /";
            Robots r = new Robots(s);

            Assert.Throws<ArgumentException>(() => r.IsPathAllowed(userAgent, path));
        }
Example #24
        public void IsPathAllowed_PathShouldBeCaseSensitive_True(
            string rule,
            string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.True(r.IsPathAllowed("*", path));
        }
Example #25
        // Can the site be indexed?
        public bool robotsTxt(string url)
        {
            //String urlBase = UrlClass.getBaseUrl(url);

            var  uri         = new Uri(url);
            bool canIGoThere = robots.IsPathAllowed("*", uri.PathAndQuery);

            return canIGoThere;
        }
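Note that uri.PathAndQuery includes the query string, so wildcard rules that target queries are honored too. A quick illustrative check; the URL and rule are assumptions:

            Robots robots = Robots.Load("User-agent: *\nDisallow: /*?");
            var uri = new Uri("https://example.com/search?q=test");

            // PathAndQuery is "/search?q=test", which /*? matches, so this returns false.
            bool allowed = robots.IsPathAllowed("*", uri.PathAndQuery);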
Example #26
        public void IsPathAllowedNoGlobalRulesFalse(
            [Values("Slurp", "slurp", "Exabot", "exabot")] string userAgent,
            [Values("", "/", "/file.html", "/directory/")] string path)
        {
            string s = "User-agent: Slurp" + this.newLine + "Disallow: /" + this.newLine + "User-agent: Exabot" + this.newLine + "Disallow: /";
            Robots r = new Robots(s);

            Assert.False(r.IsPathAllowed(userAgent, path));
        }
Example #27
        public void IsPathAllowed_WithoutAccessRule_True(
            string userAgent,
            string path)
        {
            string s = "User-agent: *" + nl + "Crawl-delay: 5";
            Robots r = new Robots(s);

            Assert.True(r.IsPathAllowed(userAgent, path));
        }
Example #28
        public void IsPathAllowed_UserAgentStringCaseInsensitive_False(
            string userAgent)
        {
            string s =
                @"User-agent: Slurp
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: figtree
Disallow: /";
            Robots r = Robots.Load(s);

            Assert.False(r.IsPathAllowed(userAgent, "/dir"));
        }
Example #29
        public void IsPathAllowedUserAgentStringCaseInsensitiveFalse(
            [Values("Slurp", "slurp", "Exabot", "exabot", "FigTree/0.1 Robot libwww-perl/5.04")] string userAgent)
        {
            string s =
                @"User-agent: Slurp
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: figtree
Disallow: /";
            Robots r = Robots.Load(s);

            Assert.False(r.IsPathAllowed(userAgent, "/dir"));
        }
Example #30
        public void ProcessLinks(IEnumerable<string> links)
        {
            foreach (var link in links.Where(l => !string.IsNullOrEmpty(l)).Distinct())
            {
                var clearedLink = ClearLink(link);
                if (!_robots.IsPathAllowed(Crawler.USER_AGENT, clearedLink))
                {
                    continue;
                }

                if (UrlHelper.GetDomain(UrlHelper.CreatUri(clearedLink)) == _masterDomain &&
                    !ProcessedList.Contains(clearedLink) &&
                    !ToProcessList.Contains(clearedLink))
                {
                    ToProcessList.Push(clearedLink);
                }
            }
        }
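A minimal, hypothetical driver loop for the class above; Crawler.USER_AGENT, _robots, ToProcessList, and ProcessedList come from the snippet, while ExtractLinks and the collection types (a Stack<string> work list and a HashSet<string> of visited URLs) are assumptions:

        // Hypothetical crawl loop built on ProcessLinks.
        while (ToProcessList.Count > 0)
        {
            string next = ToProcessList.Pop();
            ProcessedList.Add(next);

            // ExtractLinks is an assumed helper that fetches the page and parses its anchors.
            ProcessLinks(ExtractLinks(next));
        }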