public void IsPathAllowed_StarWildcard(string rule, string path, bool result)
{
    string s = @"User-agent: *" + nl + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    Assert.Equal(result, r.IsPathAllowed("*", path));
}
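// Illustrative sketch only: the theory above is presumably driven by xUnit [Theory]/[InlineData]
// attributes that are not shown here. The data values below are hypothetical examples of standard
// robots.txt '*' wildcard behaviour, not cases taken from the original fixture.
[Theory]
[InlineData("/*.gif", "/images/photo.gif", false)] // '*' matches any run of characters
[InlineData("/*.gif", "/images/photo.jpg", true)]  // no ".gif" in the path, so it stays allowed
[InlineData("/fish*", "/fishing/rods", false)]     // a trailing '*' behaves like a plain prefix rule
public void IsPathAllowed_StarWildcard_Sketch(string rule, string path, bool result)
{
    Robots r = Robots.Load("User-agent: *\nDisallow: " + rule);
    Assert.Equal(result, r.IsPathAllowed("*", path));
}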
public void Then_disallows_should_be_respected()
{
    const string robotsTxt = "RobotsFixtures\\example2.txt";
    Robots.Load(File.ReadAllText(robotsTxt));

    var lines = new Queue<string>(File.ReadAllLines(robotsTxt));
    while (lines.Peek() != "User-agent: *")
    {
        lines.Dequeue();
    }

    lines.Dequeue();
    while (!lines.Peek().StartsWith("#"))
    {
        var line = lines.Dequeue();
        if (string.IsNullOrEmpty(line))
        {
            continue;
        }

        var rule = line.Split(' ')[1].Replace("*", "foo");
        Robots.PathIsAllowed(rule).Should().BeFalse();
    }

    Robots.PathIsAllowed("/path").Should().BeTrue();
}
public void TestFixtureSetUp()
{
    var path = Path.Combine(Environment.CurrentDirectory, "RobotsFixtures\\robots.txt");
    var robots = File.ReadAllText(path); // avoids leaking the StreamReader that File.OpenText returns
    Robots.Load(robots, "bot");
}
public void IsPathAllowedRuleWithoutUserAgentTrue()
{
    // A Disallow line that is not preceded by a User-agent line belongs to no group,
    // so it must not block anything.
    string s = "Disallow: /";
    Robots r = Robots.Load(s);
    Assert.True(r.IsPathAllowed("*", "/foo"));
}
public bool IsPathAllowed_StarWildcard(string rule, string path)
{
    string s = @"User-agent: *" + nl + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    return r.IsPathAllowed("*", path);
}
public void IsPathAllowed_DollarWildcard_False(string path)
{
    string s = @"User-agent: *" + nl + "Disallow: /*.gif$";
    Robots r = Robots.Load(s);
    Assert.False(r.IsPathAllowed("*", path));
}
public void CrawlDelayRuleWithoutUserAgent()
{
    // A Crawl-delay that is not inside a User-agent group applies to nobody,
    // so the reported delay stays 0 rather than 1 second (1000 ms).
    string s = "Crawl-delay: 1";
    Robots r = Robots.Load(s);
    Assert.AreNotEqual(1000, r.CrawlDelay("Google"));
    Assert.AreEqual(0, r.CrawlDelay("Google"));
}
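// For contrast, a minimal sketch of the positive case (assumptions: the same Robots API as above,
// and CrawlDelay reporting the delay in milliseconds, which is what the 1000 checked above implies):
public void CrawlDelayRuleWithUserAgent_Sketch()
{
    string s = "User-agent: *" + "\n" + "Crawl-delay: 1";
    Robots r = Robots.Load(s);
    // Inside a User-agent group the directive applies, so 1 second is reported as 1000 ms.
    Assert.AreEqual(1000, r.CrawlDelay("Google"));
}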
public void IsPathAllowed_DollarWildcard_True([Values("asd", "a.gifa", "a.gif$")] string path)
{
    string s = @"User-agent: *" + nl + "Disallow: /*.gif$";
    Robots r = Robots.Load(s);
    Assert.True(r.IsPathAllowed("*", path));
}
public void IsPathAllowedDollarWildcardFalse(
    [Values("a.gif", "foo.gif", "b.a.gif", "a.gif.gif")] string path)
{
    string s = @"User-agent: *" + this.newLine + "Disallow: /*.gif$";
    Robots r = Robots.Load(s);
    Assert.False(r.IsPathAllowed("*", path));
}
public void IsPathAllowedStarWildcard(string rule, string path, bool result)
{
    string s = @"User-agent: *" + this.newLine + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    Assert.AreEqual(result, r.IsPathAllowed("*", path));
}
public void IsPathAllowed_PathShouldBeCaseSensitive_True(string rule, string path)
{
    string s = @"User-agent: *" + nl + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    Assert.True(r.IsPathAllowed("*", path));
}
public void IsPathAllowedPathShouldBeCaseSensitiveTrue(
    [Values("/dir/file.ext", "/dir/file.ext", "/*/file.html", "/*.gif$")] string rule,
    [Values("/dir/File.ext", "/Dir/file.ext", "/a/File.html", "a.GIF")] string path)
{
    string s = @"User-agent: *" + this.newLine + "Disallow: " + rule;
    Robots r = Robots.Load(s);
    Assert.True(r.IsPathAllowed("*", path));
}
public bool CanIGoThere(UrlItem url, out long crawlDelay)
{
    crawlDelay = 0;
    if (!RobotEnabled)
    {
        return true;
    }

    if (!_allowedUrl.GetOrAdd(url.Url, false))
    {
        // Fetch and cache the robots.txt for this host the first time it is seen.
        var content = _robots.GetOrAdd(url.Host, default(Robots));
        if (content == default)
        {
            try
            {
                var response = WebRequest.Create("https://" + url.Host + "/robots.txt").GetResponse() as HttpWebResponse;
                switch (response.StatusCode)
                {
                    case HttpStatusCode.OK:
                        using (Stream dataStream = response.GetResponseStream())
                        {
                            StreamReader reader = new StreamReader(dataStream);
                            string responseFromServer = reader.ReadToEnd();
                            Robots robots = Robots.Load(responseFromServer);
                            _robots.AddOrUpdate(url.Host, robots, (k, v) => robots);
                        }
                        break;

                    case HttpStatusCode.NotFound:
                    default:
                        // No readable robots.txt: remember the URL as allowed.
                        Console.WriteLine($"Robots file cannot be read for {url.Host}");
                        _allowedUrl.AddOrUpdate(url.Url, true, (k, v) => true);
                        return true;
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine($"Robots file cannot be read for {url.Host}, Reason {ex.Message}");
                return true;
            }
        }

        crawlDelay = _robots[url.Host].CrawlDelay("*");

        // Check only the path component when the URL can be split on the host name.
        var s = url.Url.Split(new string[] { url.Host }, StringSplitOptions.None);
        if (s.Length > 1 && !string.IsNullOrWhiteSpace(s[1]))
        {
            return _robots[url.Host].IsPathAllowed("*", s[1]);
        }
        else
        {
            return _robots[url.Host].IsPathAllowed("*", url.Url);
        }
    }

    return true;
}
public void Setup()
{
    _robots = Robots.Load(string.Empty);
    _processedList = new List<string> { "http://gocardless.com/test2" };
    _toProcessList = new Stack<string>();
    _masterDomain = "gocardless.com";
    _absolutePath = "http://gocardless.com";
    _linkProcessor = new LinkProcessor(_robots, _processedList, _toProcessList, _masterDomain, _absolutePath);
}
/// <summary>
/// Crawls a site.
/// </summary>
/// <param name="site">The site.</param>
/// <param name="loginId">The login identifier.</param>
/// <param name="password">The password.</param>
/// <returns>The number of pages crawled.</returns>
public int CrawlSite(Site site, string loginId, string password)
{
    // Delete the indices for the site that is being indexed.
    IndexContainer.DeleteDocumentByProperty(typeof(SitePageIndex), "SiteId", site.Id);

    _site = site;

    _startUrl = _site.IndexStartingLocation;
    var startingUri = new Uri(_startUrl);
    _baseUrl = startingUri.Scheme + "://" + startingUri.Authority;
    var baseUri = new Uri(_baseUrl);

    // get the robot helper class up and running
    var robotsUri = new Uri(baseUri, "robots.txt");
    var robotsTxt = GetWebText(robotsUri);
    _robotHelper = Robots.Load(robotsTxt);

    _cookieContainer = new CookieContainer();

    // If a loginId and password were included, get an authentication cookie
    if (loginId.IsNotNullOrWhiteSpace() && password.IsNotNullOrWhiteSpace())
    {
        var loginParam = new LoginParameters();
        loginParam.Username = loginId;
        loginParam.Password = password;
        loginParam.Persisted = false;

        var authUri = new Uri(baseUri, "api/Auth/Login");
        var restClient = new RestClient(authUri);
        restClient.CookieContainer = _cookieContainer;

        var request = new RestRequest(Method.POST);
        request.RequestFormat = DataFormat.Json;
        request.AddBody(loginParam);

        var response = restClient.Execute(request);
    }

    _urlQueue.Enqueue(_site.IndexStartingLocation);
    while (_urlQueue.Any())
    {
        string url = _urlQueue.Dequeue().Replace("?", "\\?");
        CrawlPage(url);
    }

    return _previouslyCrawledPages.Count;
}
public void IsPathAllowed_UserAgentStringCaseInsensitive_False(string userAgent)
{
    string s = @"User-agent: Slurp
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: figtree
Disallow: /";
    Robots r = Robots.Load(s);
    Assert.False(r.IsPathAllowed(userAgent, "/dir"));
}
public void IsPathAllowedUserAgentStringCaseInsensitiveFalse(
    [Values("Slurp", "slurp", "Exabot", "exabot", "FigTree/0.1 Robot libwww-perl/5.04")] string userAgent)
{
    string s = @"User-agent: Slurp
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: Exabot
Disallow: /
User-agent: figtree
Disallow: /";
    Robots r = Robots.Load(s);
    Assert.False(r.IsPathAllowed(userAgent, "/dir"));
}
private async Task<Robots> GetRobotTxt(Uri urlToCrawl)
{
    try
    {
        using (HttpResponseMessage robotsTxt = await _httpClient.GetAsync(urlToCrawl.AbsoluteUri + "/robots.txt"))
        {
            if (robotsTxt.IsSuccessStatusCode)
            {
                return Robots.Load(await robotsTxt.Content.ReadAsStringAsync());
            }
        }
    }
    catch (Exception e)
    {
        System.Console.Write("\nGetRobot: ");
        System.Console.Write(e.Message);
    }

    return null;
}
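// A hypothetical caller for the helper above. The method name and the "*" user agent are
// illustrative; the null result (missing or unreadable robots.txt) is treated as "no restrictions".
private async Task<bool> IsPathCrawlable(Uri urlToCrawl, string path)
{
    Robots robots = await GetRobotTxt(urlToCrawl);
    return robots == null || robots.IsPathAllowed("*", path);
}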
public async Task<IReadOnlyCollection<Uri>> ExtractLinks(CrawlRequest crawlRequest, HttpContent content)
{
    if (crawlRequest.Url.PathAndQuery.Equals("/robots.txt", StringComparison.OrdinalIgnoreCase) == false)
    {
        return new Uri[0];
    }

    var contentString = await content.ReadAsStringAsync();

    try
    {
        var robots = Robots.Load(contentString);
        return robots.Sitemaps.Select(s => s.Url).ToList();
    }
    catch
    {
        return new Uri[0];
    }
}
public static RobotsTxtReport CheckRobotsTxt(string url, Options providedOptions)
{
    var robotsTxtHost = string.IsNullOrWhiteSpace(providedOptions.ServerHostName)
        ? url
        : providedOptions.ServerHostName;
    Uri robotsTxtUri = RobotsTxtUri(robotsTxtHost);

    bool robotsTxtExists;
    var robotsContent = string.Empty;
    try
    {
        robotsContent = string.IsNullOrWhiteSpace(providedOptions.ServerHostName)
            ? Helpers.NetworkHelper.GetString(robotsTxtUri)
            : Helpers.NetworkHelper.GetString(robotsTxtUri, url);

        // something was downloaded, and we need to check if it is not empty now
        robotsTxtExists = !string.IsNullOrWhiteSpace(robotsContent);
    }
    catch (Exception exception)
    {
        Console.WriteLine($"Exception received when trying to download {robotsTxtUri}");
        Console.WriteLine($"Exception: {exception.Message}");
        Console.WriteLine($"Exception stack trace: {exception.StackTrace}");
        robotsTxtExists = false;
    }

    var robotsFile = robotsTxtExists ? Robots.Load(robotsContent) : Robots.Load(string.Empty);
    var getCheckStatus = GetCheckStatus(robotsFile, providedOptions.CrawlingDenied);
    var sitemapsIsAccessible = CheckSitemapIsAccessible(robotsFile);

    if (HaveSitemaps(robotsFile))
    {
        // if the robots.txt check is OK, but a sitemap defined in it is not accessible, fail the check
        getCheckStatus = getCheckStatus ? sitemapsIsAccessible : getCheckStatus;
    }

    var report = new RobotsTxtReport
    {
        Url = url,
        CheckStatus = getCheckStatus,
        RobotsTxtExists = robotsTxtExists,
        SitemapsIsAccessible = sitemapsIsAccessible,
        Robots = robotsFile
    };

    return report;
}
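// Illustrative usage of the check above; the Options initializer is an assumption and shows only
// the two members the method actually reads (ServerHostName and CrawlingDenied).
public static void RunRobotsTxtCheck()
{
    var options = new Options { ServerHostName = null, CrawlingDenied = false };
    RobotsTxtReport report = CheckRobotsTxt("https://example.com", options);
    Console.WriteLine($"robots.txt found: {report.RobotsTxtExists}, check passed: {report.CheckStatus}");
}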
/// <summary>
/// Crawls a site.
/// </summary>
/// <param name="site">The site.</param>
/// <param name="loginId">The login identifier.</param>
/// <param name="password">The password.</param>
/// <returns>The number of pages crawled.</returns>
public int CrawlSite(Site site, string loginId, string password)
{
    _site = site;

    // get the robot helper class up and running
    _robotHelper = Robots.Load(_site.IndexStartingLocation);

    _startUrl = _site.IndexStartingLocation;
    var startingUri = new Uri(_startUrl);
    _baseUrl = startingUri.Scheme + "://" + startingUri.Authority;

    _cookieContainer = new CookieContainer();

    // If a loginId and password were included, get an authentication cookie
    if (loginId.IsNotNullOrWhitespace() && password.IsNotNullOrWhitespace())
    {
        var loginParam = new LoginParameters();
        loginParam.Username = loginId;
        loginParam.Password = password;
        loginParam.Persisted = false;

        var baseUri = new Uri(_baseUrl);
        var authUri = new Uri(baseUri, "api/Auth/Login");
        var restClient = new RestClient(authUri);
        restClient.CookieContainer = _cookieContainer;

        var request = new RestRequest(Method.POST);
        request.RequestFormat = DataFormat.Json;
        request.AddBody(loginParam);

        var response = restClient.Execute(request);
    }

    CrawlPage(_site.IndexStartingLocation);

    return _previouslyCrawledPages.Count;
}
public Robots ReadRobotsTxt(string path)
{
    return Robots.Load(_webClientService.DownloadString($"{path}/robots.txt"));
}
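// Illustrative usage of the reader above; the user-agent string and example URLs are placeholders.
public bool CanFetch(string host, string path)
{
    Robots robots = ReadRobotsTxt(host);            // e.g. host = "https://example.com"
    return robots.IsPathAllowed("MyCrawler", path); // e.g. path = "/private/page.html"
}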
public void Then_PathIsAllowed_should_return_true()
{
    Robots.Load(null);
    Robots.PathIsAllowed("").Should().BeTrue();
}
public void Then_no_exception_is_thrown()
{
    Action action = () => Robots.Load(null);
    action.ShouldNotThrow();
}
public static TransformBlock<Page, Page> GetBlock(IServiceScopeFactory scopeFactory, BufferBlock<Page> frontier)
{
    return new TransformBlock<Page, Page>(async page =>
    {
        if (page == null)
        {
            return null;
        }

        try
        {
            var domainRegex = new Regex(@"https?:\/\/(.+?)\/");
            var domain = domainRegex.Match(page.Url.ToString()).Groups[1].Value;

            var scope = scopeFactory.CreateScope();
            var dbContext = (Models.DbContext)scope.ServiceProvider.GetService(typeof(Models.DbContext));

            Site site;
            lock (Crawler.lockObj)
            {
                site = dbContext.Site.Where(s => s.Domain == domain).FirstOrDefault();
            }

            if (site == null)
            {
                var client = new HttpClient();
                HttpResponseMessage response = null;
                try
                {
                    response = await client.GetAsync("http://" + domain + "/robots.txt");
                }
                catch { }

                string robotsContent = null, sitemapContent = null;
                if (response?.IsSuccessStatusCode ?? false)
                {
                    robotsContent = await response.Content.ReadAsStringAsync();
                    var r = Robots.Load(robotsContent);
                    if (r.Sitemaps.Count > 0)
                    {
                        response = await client.GetAsync(r.Sitemaps[0].Url);
                        if (response.IsSuccessStatusCode)
                        {
                            sitemapContent = await response.Content.ReadAsStringAsync();
                        }
                    }
                }

                lock (Crawler.lockObj)
                {
                    EntityEntry<Site> entityEntry = dbContext.Site.Add(new Site()
                    {
                        Domain = domain,
                        RobotsContent = robotsContent,
                        SitemapContent = sitemapContent
                    });
                    site = entityEntry.Entity;
                    dbContext.SaveChanges();
                }

                Log.Information("Site from entity: {0} {Id}", site.Domain, site.Id);

                if (sitemapContent != null)
                {
                    var sitemap = new SitemapParser().Parse(sitemapContent);
                    foreach (var item in sitemap.Items)
                    {
                        await Crawler.PostPage(item.Location, dbContext, frontier, null);
                    }
                }
            }

            scope.Dispose();
            page.SiteId = site.Id;
        }
        catch (Exception e)
        {
            Log.Error(e, "Site loader exception");
        }

        return page;
    });
}
public static ActionBlock<Page> GetBlock(IServiceScopeFactory scopeFactory, BufferBlock<Page> frontier)
{
    return new ActionBlock<Page>(async page =>
    {
        if (page == null)
        {
            return;
        }

        Log.Information("Link scraper {0}", page.Url);

        // If page is not html
        if (page.document == null)
        {
            return;
        }

        var redirectRegex = new Regex("(?:document\\.location|location\\.href)\\s?=\\s?(?:'|\")([^'\"])+(?:'|\")");
        var redirects = redirectRegex.Matches(page.HtmlContent).Select(m => m.Groups[1].Value);
        var links = page.document.QuerySelectorAll("a").Select(l => l.GetAttribute("href"));
        Log.Information("Link scraper found {0} links and {1} redirects", links.Count(), redirects.Count());

        var list = links.ToList();
        list.AddRange(redirects);

        lock (Crawler.lockObj)
        {
            var scope = scopeFactory.CreateScope();
            var dbContext = scope.ServiceProvider.GetService<DbContext>();
            var site = dbContext.Site.Where(d => d.Id == page.SiteId).FirstOrDefault();
            // Log.Information("Site: {0} for page: {1}", site.Domain, page.Url);
            var r = Robots.Load(site.RobotsContent);

            foreach (var url in list)
            {
                if (url == null)
                {
                    continue;
                }

                try
                {
                    var httpRegex = new Regex(@"https?:\/\/");
                    var absoluteUrl = url;
                    if (!httpRegex.IsMatch(url))
                    {
                        absoluteUrl = page.Url + url;
                    }

                    if (r.IsPathAllowed(Crawler.CrawlerName, absoluteUrl))
                    {
                        Crawler.PostPage(new Uri(absoluteUrl), dbContext, frontier, page.Id).Wait();
                    }
                    else
                    {
                        Log.Information("Url: {0} is not allowed", absoluteUrl);
                    }
                }
                catch { }
            }

            scope.Dispose();
        }
    });
}