Example #1
        public void IsPathAllowed_StarWildcard(string rule, string path, bool result)
        {
            string s = @"User-agent: *" + nl + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.Equal(result, r.IsPathAllowed("*", path));
        }
        public void Then_disallows_should_be_respected()
        {
            const string robotsTxt = "RobotsFixtures\\example2.txt";

            Robots.Load(File.ReadAllText(robotsTxt));
            var lines = new Queue <string>(File.ReadAllLines(robotsTxt));

            while (lines.Peek() != "User-agent: *")
            {
                lines.Dequeue();
            }
            lines.Dequeue();
            while (!lines.Peek().StartsWith("#"))
            {
                var line = lines.Dequeue();
                if (string.IsNullOrEmpty(line))
                {
                    continue;
                }
                var rule = line.Split(' ')[1].Replace("*", "foo");
                Robots.PathIsAllowed(rule).Should().BeFalse();
            }

            Robots.PathIsAllowed("/path").Should().BeTrue();
        }
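
The tests above exercise the basic pattern shared by every example on this page: parse a robots.txt body with Robots.Load and query it with IsPathAllowed. A minimal, self-contained sketch of that pattern (the RobotsTxt namespace is an assumption; use whichever Robots implementation your project references):

using System;
using RobotsTxt; // assumption: the package that provides Robots.Load / IsPathAllowed

class RobotsQuickCheck
{
    static void Main()
    {
        // Disallow /private for every user agent.
        Robots robots = Robots.Load("User-agent: *\nDisallow: /private");

        Console.WriteLine(robots.IsPathAllowed("*", "/private/page.html")); // False
        Console.WriteLine(robots.IsPathAllowed("*", "/public/page.html"));  // True
    }
}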
Example #3
        public void TestFixtureSetUp()
        {
            var path   = Path.Combine(Environment.CurrentDirectory, "RobotsFixtures\\robots.txt");
            var robots = File.OpenText(path).ReadToEnd();

            Robots.Load(robots, "bot");
        }
        public void IsPathAllowedRuleWithoutUserAgentTrue()
        {
            string s = "Disallow: /";
            Robots r = Robots.Load(s);

            Assert.True(r.IsPathAllowed("*", "/foo"));
        }
Example #5
        public bool IsPathAllowed_StarWildcard(string rule, string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            return(r.IsPathAllowed("*", path));
        }
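
A bool-returning helper like the one above is usually driven by NUnit's ExpectedResult. The rule/path pairs below are illustrative assumptions, not values from the original fixture:

        [TestCase("/fish*", "/fish.html", ExpectedResult = false)]
        [TestCase("/fish*", "/Fish.asp", ExpectedResult = true)]   // paths are case sensitive
        [TestCase("/*.php", "/index.php", ExpectedResult = false)]
        [TestCase("/*.php", "/index.html", ExpectedResult = true)]
        public bool IsPathAllowed_StarWildcard_Sketch(string rule, string path)
        {
            string s = "User-agent: *\n" + "Disallow: " + rule;
            return Robots.Load(s).IsPathAllowed("*", path);
        }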
Example #6
        public void IsPathAllowed_DollarWildcard_False(
            string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: /*.gif$";
            Robots r = Robots.Load(s);

            Assert.False(r.IsPathAllowed("*", path));
        }
Example #7
        public void CrawlDelayRuleWithoutUserAgent()
        {
            string s = "Crawl-delay: 1";
            Robots r = Robots.Load(s);

            Assert.AreNotEqual(1000, r.CrawlDelay("Google"));
            Assert.AreEqual(0, r.CrawlDelay("Google"));
        }
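
For contrast, a sketch of the same API with the Crawl-delay rule placed inside a user-agent group. The 1000 in the test above suggests the delay is reported in milliseconds, but treat that, and the fallback of 0 for unmatched agents, as assumptions:

        public void CrawlDelayRuleWithUserAgent_Sketch()
        {
            string s = "User-agent: Google\nCrawl-delay: 1";
            Robots r = Robots.Load(s);

            Assert.AreEqual(1000, r.CrawlDelay("Google"));    // 1 second, reported in milliseconds (assumed)
            Assert.AreEqual(0, r.CrawlDelay("SomeOtherBot")); // no matching group, no delay (assumed)
        }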
Example #8
        public void IsPathAllowed_DollarWildcard_True(
            [Values("asd", "a.gifa", "a.gif$")] string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: /*.gif$";
            Robots r = Robots.Load(s);

            Assert.True(r.IsPathAllowed("*", path));
        }
        public void IsPathAllowedDollarWildcardFalse(
            [Values("a.gif", "foo.gif", "b.a.gif", "a.gif.gif")] string path)
        {
            string s = @"User-agent: *" + this.newLine + "Disallow: /*.gif$";
            Robots r = Robots.Load(s);

            Assert.False(r.IsPathAllowed("*", path));
        }
        public void IsPathAllowedStarWildcard(string rule, string path, bool result)
        {
            string s = @"User-agent: *" + this.newLine + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.AreEqual(result, r.IsPathAllowed("*", path));
        }
Example #11
        public void IsPathAllowed_PathShouldBeCaseSensitive_True(
            string rule,
            string path)
        {
            string s = @"User-agent: *" + nl + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.True(r.IsPathAllowed("*", path));
        }
        public void IsPathAllowedPathShouldBeCaseSensitiveTrue(
            [Values("/dir/file.ext", "/dir/file.ext", "/*/file.html", "/*.gif$")] string rule,
            [Values("/dir/File.ext", "/Dir/file.ext", "/a/File.html", "a.GIF")] string path)
        {
            string s = @"User-agent: *" + this.newLine + "Disallow: " + rule;
            Robots r = Robots.Load(s);

            Assert.True(r.IsPathAllowed("*", path));
        }
Example #13
        public bool CanIGoThere(UrlItem url, out long crawlDelay)
        {
            crawlDelay = 0;

            if (!RobotEnabled)
            {
                return(true);
            }

            // Skip the robots.txt lookup entirely if this URL was already marked as allowed.
            if (!_allowedUrl.GetOrAdd(url.Url, false))
            {
                // Fetch and cache the parsed robots.txt for this host on first use.
                var content = _robots.GetOrAdd(url.Host, default(Robots));
                if (content == default)
                {
                    try
                    {
                        var response = WebRequest.Create("https://" + url.Host + "/robots.txt").GetResponse() as HttpWebResponse;
                        switch (response.StatusCode)
                        {
                        case HttpStatusCode.OK:
                            using (Stream dataStream = response.GetResponseStream())
                            {
                                StreamReader reader             = new StreamReader(dataStream);
                                string       responseFromServer = reader.ReadToEnd();
                                Robots       robots             = Robots.Load(responseFromServer);
                                _robots.AddOrUpdate(url.Host, robots, (k, v) => robots);
                            }
                            break;

                        case HttpStatusCode.NotFound:
                        default:
                            Console.WriteLine($"Robots file cannot be read for {url.Host}");
                            _allowedUrl.AddOrUpdate(url.Url, true, (k, v) => true);
                            return(true);
                        }
                    }
                    catch (Exception ex)
                    {
                        Console.WriteLine($"Robots file cannot be read for {url.Host}, Reason {ex.Message}");
                        return(true);
                    }
                }
                crawlDelay = _robots[url.Host].CrawlDelay("*");
                // Everything after the host name is treated as the path to test against the rules.
                var s = url.Url.Split(new string[] { url.Host }, StringSplitOptions.None);
                if (s.Length > 1 && !string.IsNullOrWhiteSpace(s[1]))
                {
                    return(_robots[url.Host].IsPathAllowed("*", s[1]));
                }
                else
                {
                    return(_robots[url.Host].IsPathAllowed("*", url.Url));
                }
            }
            return(true);
        }
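
Splitting url.Url on url.Host, as above, is fragile whenever the host string appears elsewhere in the URL. A sketch of deriving the path with System.Uri instead (a hypothetical helper, not part of the original class):

        // Hypothetical helper: derive the path portion with System.Uri instead of string splitting.
        private static bool IsAllowedByRobots(Robots robots, string absoluteUrl)
        {
            var uri = new Uri(absoluteUrl);
            // PathAndQuery is "/" for a bare host, which the robots rules handle naturally.
            return robots.IsPathAllowed("*", uri.PathAndQuery);
        }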
Example #14
 public void Setup()
 {
     _robots        = Robots.Load(string.Empty);
     _processedList = new List <string>
     {
         "http://gocardless.com/test2"
     };
     _toProcessList = new Stack <string>();
     _masterDomain  = "gocardless.com";
     _absolutePath  = "http://gocardless.com";
     _linkProcessor = new LinkProcessor(_robots, _processedList, _toProcessList, _masterDomain, _absolutePath);
 }
Example #15
        /// <summary>
        /// Crawls a site.
        /// </summary>
        /// <param name="site">The site.</param>
        /// <param name="loginId">The login identifier.</param>
        /// <param name="password">The password.</param>
        /// <returns></returns>
        public int CrawlSite(Site site, string loginId, string password)
        {
            // Delete the indices for the site that is being indexed.
            IndexContainer.DeleteDocumentByProperty(typeof(SitePageIndex), "SiteId", site.Id);

            _site = site;

            _startUrl = _site.IndexStartingLocation;
            var startingUri = new Uri(_startUrl);

            _baseUrl = startingUri.Scheme + "://" + startingUri.Authority;
            var baseUri = new Uri(_baseUrl);

            // get the robot helper class up and running
            var robotsUri = new Uri(baseUri, "robots.txt");
            var robotsTxt = GetWebText(robotsUri);

            _robotHelper = Robots.Load(robotsTxt);

            _cookieContainer = new CookieContainer();

            // If a loginId and password were included, get an authentication cookie
            if (loginId.IsNotNullOrWhiteSpace() && password.IsNotNullOrWhiteSpace())
            {
                var loginParam = new LoginParameters();
                loginParam.Username  = loginId;
                loginParam.Password  = password;
                loginParam.Persisted = false;

                var authUri    = new Uri(baseUri, "api/Auth/Login");
                var restClient = new RestClient(authUri);
                restClient.CookieContainer = _cookieContainer;

                var request = new RestRequest(Method.POST);
                request.RequestFormat = DataFormat.Json;
                request.AddBody(loginParam);

                var response = restClient.Execute(request);
            }

            _urlQueue.Enqueue(_site.IndexStartingLocation);
            while (_urlQueue.Any())
            {
                string url = _urlQueue.Dequeue().Replace("?", "\\?");
                CrawlPage(url);
            }

            return(_previouslyCrawledPages.Count);
        }
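
GetWebText is not shown in this example; a minimal sketch of what such a helper might do, assuming it simply downloads the robots.txt body and falls back to an empty string on failure:

        // Assumed stand-in for GetWebText: download the robots.txt body, or return an
        // empty string so Robots.Load still receives valid (if empty) input.
        private static string GetRobotsTxtText(Uri robotsUri)
        {
            using (var client = new System.Net.Http.HttpClient())
            {
                try
                {
                    return client.GetStringAsync(robotsUri).GetAwaiter().GetResult();
                }
                catch (System.Net.Http.HttpRequestException)
                {
                    return string.Empty;
                }
            }
        }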
Example #16
        public void IsPathAllowed_UserAgentStringCaseInsensitive_False(
            string userAgent)
        {
            string s =
                @"User-agent: Slurp
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: figtree
Disallow: /";
            Robots r = Robots.Load(s);

            Assert.False(r.IsPathAllowed(userAgent, "/dir"));
        }
        public void IsPathAllowedUserAgentStringCaseInsensitiveFalse(
            [Values("Slurp", "slurp", "Exabot", "exabot", "FigTree/0.1 Robot libwww-perl/5.04")] string userAgent)
        {
            string s =
                @"User-agent: Slurp
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: figtree
Disallow: /";
            Robots r = Robots.Load(s);

            Assert.False(r.IsPathAllowed(userAgent, "/dir"));
        }
Example #18
 private async Task <Robots> GetRobotTxt(Uri urlToCrawl)
 {
     try
     {
         using (HttpResponseMessage robotsTxt = await _httpClient.GetAsync(urlToCrawl.AbsoluteUri + "/robots.txt"))
         {
             if (robotsTxt.IsSuccessStatusCode)
             {
                 return(Robots.Load(await robotsTxt.Content.ReadAsStringAsync()));
             }
         }
     }
     catch (Exception e)
     {
         System.Console.Write("\nGetRobot: ");
         System.Console.Write(e.Message);
     }
     return(null);
 }
        public async Task <IReadOnlyCollection <Uri> > ExtractLinks(CrawlRequest crawlRequest, HttpContent content)
        {
            if (crawlRequest.Url.PathAndQuery.Equals("/robots.txt", StringComparison.OrdinalIgnoreCase) == false)
            {
                return(new Uri[0]);
            }

            var contentString = await content.ReadAsStringAsync();

            try
            {
                var robots = Robots.Load(contentString);
                return(robots.Sitemaps.Select(s => s.Url).ToList());
            }
            catch
            {
                return(new Uri[0]);
            }
        }
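
A small sketch of consuming the Sitemaps collection used above, printing every sitemap URL declared in a robots.txt body (the printing loop is illustrative):

        // Print every sitemap URL declared in a robots.txt body.
        public static void PrintSitemaps(string robotsTxtContent)
        {
            Robots robots = Robots.Load(robotsTxtContent);
            foreach (var sitemap in robots.Sitemaps)
            {
                System.Console.WriteLine(sitemap.Url);
            }
        }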
Example #20
        public static RobotsTxtReport CheckRobotsTxt(string url, Options providedOptions)
        {
            var  robotsTxtHost = string.IsNullOrWhiteSpace(providedOptions.ServerHostName) ? url : providedOptions.ServerHostName;
            Uri  robotsTxtUri  = RobotsTxtUri(robotsTxtHost);
            bool robotsTxtExists;
            var  robotsContent = string.Empty;

            try
            {
                robotsContent = string.IsNullOrWhiteSpace(providedOptions.ServerHostName) ? Helpers.NetworkHelper.GetString(robotsTxtUri) : Helpers.NetworkHelper.GetString(robotsTxtUri, url);
                // Something was downloaded; check that it is not empty.
                robotsTxtExists = !string.IsNullOrWhiteSpace(robotsContent);
            }
            catch (Exception exception)
            {
                Console.WriteLine($"Exception received when trying to download {robotsTxtUri.ToString()}");
                Console.WriteLine($"Exception: {exception.Message}");
                Console.WriteLine($"Exception stracktrace {exception.StackTrace}");
                robotsTxtExists = false;
            }

            var robotsFile           = robotsTxtExists ? Robots.Load(robotsContent) : Robots.Load(string.Empty);
            var getCheckStatus       = GetCheckStatus(robotsFile, providedOptions.CrawlingDenied);
            var sitemapsIsAccessible = CheckSitemapIsAccessible(robotsFile);

            if (HaveSitemaps(robotsFile))
            {
                // If the robots.txt check passed but a sitemap defined in it is not accessible, fail the check.
                getCheckStatus = getCheckStatus ? sitemapsIsAccessible : getCheckStatus;
            }

            var report = new RobotsTxtReport
            {
                Url                  = url,
                CheckStatus          = getCheckStatus,
                RobotsTxtExists      = robotsTxtExists,
                SitemapsIsAccessible = sitemapsIsAccessible,
                Robots               = robotsFile
            };

            return(report);
        }
Example #21
        /// <summary>
        /// Crawls a site.
        /// </summary>
        /// <param name="site">The site.</param>
        /// <param name="loginId">The login identifier.</param>
        /// <param name="password">The password.</param>
        /// <returns></returns>
        public int CrawlSite(Site site, string loginId, string password)
        {
            _site = site;

            // get the robot helper class up and running
            _robotHelper = Robots.Load(_site.IndexStartingLocation);

            _startUrl = _site.IndexStartingLocation;

            var startingUri = new Uri(_startUrl);

            _baseUrl = startingUri.Scheme + "://" + startingUri.Authority;

            _cookieContainer = new CookieContainer();

            // If a loginId and password were included, get an authentication cookie
            if (loginId.IsNotNullOrWhitespace() && password.IsNotNullOrWhitespace())
            {
                var loginParam = new LoginParameters();
                loginParam.Username  = loginId;
                loginParam.Password  = password;
                loginParam.Persisted = false;

                var baseUri    = new Uri(_baseUrl);
                var authUri    = new Uri(baseUri, "api/Auth/Login");
                var restClient = new RestClient(authUri);
                restClient.CookieContainer = _cookieContainer;

                var request = new RestRequest(Method.POST);
                request.RequestFormat = DataFormat.Json;
                request.AddBody(loginParam);

                var response = restClient.Execute(request);
            }

            CrawlPage(_site.IndexStartingLocation);

            return(_previouslyCrawledPages.Count);
        }
Example #22
 public Robots ReadRobotsTxt(string path)
 {
     return(Robots.Load(_webClientService.DownloadString($"{path}/robots.txt")));
 }
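
A hypothetical call site for the ReadRobotsTxt helper above (the user agent and arguments are placeholders):

 // Hypothetical call site: load the rules once per host, then test individual paths.
 public bool IsCrawlAllowed(string host, string path)
 {
     Robots robots = ReadRobotsTxt(host);
     return robots.IsPathAllowed("MyCrawler", path);
 }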
 public void Then_PathIsAllowed_should_return_true()
 {
     Robots.Load(null);
     Robots.PathIsAllowed("").Should().BeTrue();
 }
        public void Then_no_exception_is_thrown()
        {
            Action action = () => Robots.Load(null);

            action.ShouldNotThrow();
        }
Example #25
        public static TransformBlock <Page, Page> GetBlock(IServiceScopeFactory scopeFactory, BufferBlock <Page> frontier)
        {
            return(new TransformBlock <Page, Page>(async page => {
                if (page == null)
                {
                    return null;
                }

                try
                {
                    var domainRegex = new Regex(@"https?:\/\/(.+?)\/");
                    var domain = domainRegex.Match(page.Url.ToString()).Groups[1].Value;

                    var scope = scopeFactory.CreateScope();
                    var dbContext = (Models.DbContext)scope.ServiceProvider.GetService(typeof(Models.DbContext));
                    Site site;
                    lock (Crawler.lockObj)
                    {
                        site = dbContext.Site.Where(s => s.Domain == domain).FirstOrDefault();
                    }
                    if (site == null)
                    {
                        var client = new HttpClient();

                        HttpResponseMessage response = null;
                        try
                        {
                            response = await client.GetAsync("http://" + domain + "/robots.txt");
                        }
                        catch { }
                        string robotsContent = null, sitemapContent = null;
                        if (response?.IsSuccessStatusCode ?? false)
                        {
                            robotsContent = await response.Content.ReadAsStringAsync();
                            var r = Robots.Load(robotsContent);

                            if (r.Sitemaps.Count > 0)
                            {
                                response = await client.GetAsync(r.Sitemaps[0].Url);
                                if (response.IsSuccessStatusCode)
                                {
                                    sitemapContent = await response.Content.ReadAsStringAsync();
                                }
                            }
                        }

                        lock (Crawler.lockObj)
                        {
                            EntityEntry <Site> entityEntry = dbContext.Site.Add(new Site()
                            {
                                Domain = domain,
                                RobotsContent = robotsContent,
                                SitemapContent = sitemapContent
                            });
                            site = entityEntry.Entity;
                            dbContext.SaveChanges();
                        }

                        Log.Information("Site from entity: {0} {Id}", site.Domain, site.Id);

                        if (sitemapContent != null)
                        {
                            var sitemap = new SitemapParser().Parse(sitemapContent);

                            foreach (var item in sitemap.Items)
                            {
                                await Crawler.PostPage(item.Location, dbContext, frontier, null);
                            }
                        }
                    }
                    scope.Dispose();
                    page.SiteId = site.Id;
                }
                catch (Exception e)
                {
                    Log.Error(e, "Site loader exception");
                }
                return page;
            }));
        }
Example #26
        public static ActionBlock <Page> GetBlock(IServiceScopeFactory scopeFactory, BufferBlock <Page> frontier)
        {
            return(new ActionBlock <Page>(async page =>
            {
                if (page == null)
                {
                    return;
                }

                Log.Information("Link scraper {0}", page.Url);
                // If page is not html
                if (page.document == null)
                {
                    return;
                }

                // Capture JavaScript redirect targets such as location.href = '/next-page'.
                var redirectRegex = new Regex("(?:document\\.location|location\\.href)\\s?=\\s?(?:'|\")([^'\"]+)(?:'|\")");
                var redirects = redirectRegex.Matches(page.HtmlContent).Select(m => m.Groups[1].Value);

                var links = page.document.QuerySelectorAll("a").Select(l => l.GetAttribute("href"));
                Log.Information("Link scraper found {0} links and {1} redirects", links.Count(), redirects.Count());

                var list = links.ToList();
                list.AddRange(redirects);

                lock (Crawler.lockObj) {
                    var scope = scopeFactory.CreateScope();
                    var dbContext = scope.ServiceProvider.GetService <DbContext>();
                    var site = dbContext.Site.Where(d => d.Id == page.SiteId).FirstOrDefault();
                    // Log.Information("Site: {0} for page: {1}", site.Domain, page.Url);
                    var r = Robots.Load(site.RobotsContent);

                    foreach (var url in list)
                    {
                        if (url == null)
                        {
                            continue;
                        }
                        try
                        {
                            var httpRegex = new Regex(@"https?:\/\/");
                            var absoluteUrl = url;
                            if (!httpRegex.IsMatch(url))
                            {
                                absoluteUrl = page.Url + url;
                            }
                            if (r.IsPathAllowed(Crawler.CrawlerName, absoluteUrl))
                            {
                                Crawler.PostPage(new Uri(absoluteUrl), dbContext, frontier, page.Id).Wait();
                            }
                            else
                            {
                                Log.Information("Url: {0} is not allowed", absoluteUrl);
                            }
                        }
                        catch { }
                    }
                    scope.Dispose();
                }
            }));
        }
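
Concatenating page.Url and url, as above, breaks for root-relative hrefs such as "/about"; a sketch of resolving links with Uri composition instead (illustrative, not part of the original block):

        // Resolve a possibly relative href against the page it was found on.
        // new Uri(baseUri, relative) handles "/path", "sub/page" and absolute URLs alike.
        private static string ResolveLink(Uri pageUrl, string href)
        {
            return new Uri(pageUrl, href).AbsoluteUri;
        }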