// Setting up bot config
public void setup_abot()
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 150;
    crawlConfig.MaxConcurrentThreads = 25;
    crawlConfig.IsExternalPageCrawlingEnabled = false;
    crawlConfig.MaxCrawlDepth = 1;
    crawlConfig.MaxPagesToCrawl = 1000;
    crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";

    crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        Regex rx = new Regex(@"\d{5}");
        if (!rx.IsMatch(pageToCrawl.Uri.ToString()) && !pageToCrawl.Uri.ToString().Contains("text="))
            return new CrawlDecision { Allow = false, Reason = "Want only comlinks" };

        return new CrawlDecision { Allow = true, Reason = "OK Link" };
    });
}
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3); //this is more than the max configured crawl delay (should be ignored)
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true; //by having this equal to true we expect the IDomainRateLimiter to be called
    _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2; //this is less than the crawl delay (should be used)

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHyperLinkParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3)); //by having a crawl delay above zero we expect the IDomainRateLimiter to be called
}
static void Main(string[] args)
{
    CrawlConfiguration config = new CrawlConfiguration();
    config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

    // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
    // Make sure to dispose this instance or you will have a zombie process!
    IWebDriver driver = CreatePhantomJsDriver(config);

    // Create the content extractor that uses PhantomJS.
    IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

    // Create a PageRequester that will use the extractor.
    IPageRequester requester = new PageRequester(config, extractor);

    using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null))
    {
        crawler.PageCrawlCompleted += OnPageCrawlCompleted;

        CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));

        if (result.ErrorOccurred)
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        else
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    Console.Read();
}
static void Main(string[] args)
{
    CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();

    PoliteWebCrawler crawler = new PoliteWebCrawler();
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    crawler.ShouldCrawlPage((crawledPage, crawledContext) =>
    {
        CrawlDecision decision = new CrawlDecision();
        var uri = crawledPage.Uri.ToString();
        if (crawledPage.IsRoot || uri.StartsWith("http://www.tingchina.com/erge/"))
        {
            decision.Allow = true;
        }
        else
        {
            decision.Allow = false;
            decision.Reason = "Just erge pages!";
        }
        return decision;
    });

    CrawlResult result = crawler.Crawl(new Uri("http://www.tingchina.com/"), cancellationTokenSource);

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    else
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);

    Console.ReadLine();
}
public void Crawl(Uri uri, Action<Page> callback)
{
    var crawlConfig = new CrawlConfiguration
    {
        CrawlTimeoutSeconds = 0,
        MaxConcurrentThreads = 5,
        UserAgentString = "InspectionCrawler v1.0",
        MinCrawlDelayPerDomainMilliSeconds = 1000,
        MaxPagesToCrawl = 0,
        MaxPagesToCrawlPerDomain = 0,
        MaxCrawlDepth = int.MaxValue
    };

    var crawler = new PoliteWebCrawler(crawlConfig);

    crawler.PageCrawlCompletedAsync += (sender, args) =>
    {
        var page = args.CrawledPage;

        if (page.WebException != null && page.HttpWebResponse == null)
        {
            _log.Log(new LogMessage(LogType.Error, "Could not get page", page.WebException, page.Uri));
            return;
        }

        callback(Convert(args.CrawledPage));
    };

    crawler.Crawl(uri);
}
public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Never());
}
public WebCrawler(ICrawlingStats crawlingStats, IResultWriter resultWriter, IClock clock)
{
    _crawlingStats = crawlingStats;
    _resultWriter = resultWriter;
    _clock = clock;

    _webCrawler = new PoliteWebCrawler();
    _webCrawler.PageCrawlCompletedAsync += ProcessPageCrawlCompleted;

    _startCrawlingTime = _clock.FormattedCurrentTime();
    //_resultFilePath = System.Configuration.ConfigurationManager.AppSettings["ResultFileName"];
}
public void Crawl_MaxPagesTo25_OnlyCrawls25Pages()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 25;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

    Assert.AreEqual(25, pagesCrawledCount);
}
public void Crawl_CrawlTimeoutIs1Sec_TimesOut()
{
    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.CrawlTimeoutSeconds = 1;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));

    Assert.IsFalse(result.ErrorOccurred);
    Assert.IsTrue(result.Elapsed.TotalSeconds < 5);
    Assert.IsTrue(pagesCrawledCount > 0);
}
public void CrawlAsync()
{
    PoliteWebCrawler crawler = new PoliteWebCrawler();
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();
    CrawlResult result = crawler.Crawl(new Uri("http://www.funda.nl"), cancellationTokenSource);

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    else
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
}
public void Crawl_MaxPagesTo5_WithCrawlDelay_OnlyCrawls5Pages()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; //adding delay since it increases the chance of issues with abot crawling more than MaxPagesToCrawl
    configuration.MaxPagesToCrawl = 5;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

    Assert.AreEqual(5, pagesCrawledCount);
}
public void Crawl_Asynchronous_CancellationTokenCancelled_StopsCrawl()
{
    CancellationTokenSource cancellationTokenSource = new CancellationTokenSource();

    System.Timers.Timer timer = new System.Timers.Timer(800);
    timer.Elapsed += (o, e) =>
    {
        cancellationTokenSource.Cancel();
        timer.Stop();
        timer.Dispose();
    };
    timer.Start();

    PoliteWebCrawler crawler = new PoliteWebCrawler();
    Task<CrawlResult> task = Task.Factory.StartNew<CrawlResult>(() => crawler.Crawl(new Uri("http://localhost.fiddler:1111/"), cancellationTokenSource));

    CrawlResult result = task.Result;

    Assert.IsTrue(result.ErrorOccurred);
    Assert.IsTrue(result.ErrorException is OperationCanceledException);
}
public void Crawl_IsRateLimited()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 3;
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

    int pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
    var start = DateTime.Now;
    crawler.Crawl(uriToCrawl);
    var elapsed = DateTime.Now - start;

    Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
    Assert.AreEqual(3, pagesCrawledCount);
}
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = new PoliteWebCrawler();

    //Register a lambda expression that will make Abot not crawl any url that has the word "ghost" in it.
    //For example http://a.com/ghost would not get crawled if the link were found during the crawl.
    //If you set the log4net log level to "DEBUG" you will see a log message when any page is not allowed to be crawled.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPage method is run.
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        if (pageToCrawl.Uri.AbsoluteUri.Contains("ghost"))
            return new CrawlDecision { Allow = false, Reason = "Scared of ghosts" };

        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that will tell Abot to not download the page content for any page after the 5th.
    //Abot will still make the http request but will not read the raw content from the stream.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldDownloadPageContent method is run.
    crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        if (crawlContext.CrawledCount >= 5)
            return new CrawlDecision { Allow = false, Reason = "We already downloaded the raw page content for 5 pages" };

        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        if (!crawledPage.IsInternal)
            return new CrawlDecision { Allow = false, Reason = "We dont crawl links of external pages" };

        return new CrawlDecision { Allow = true };
    });

    return crawler;
}
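For reference, a minimal sketch (not part of the original example) of how the lambda-configured crawler above might be driven; the root URL and the completion handler are placeholders, and the call pattern simply mirrors the other Main examples in this listing.

static void Main(string[] args)
{
    // Hypothetical driver for the factory method above; the URL below is a placeholder.
    IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();
    crawler.PageCrawlCompletedAsync += (sender, e) => Console.WriteLine("Crawled {0}", e.CrawledPage.Uri);

    CrawlResult result = crawler.Crawl(new Uri("http://example.com/"));

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    else
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
}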
static void Main(string[] args)
{
    //Read configuration from file
    connectionString = fileTyp.GetConnectionString(appDataPath);
    webURL = fileTyp.GetHostToCrawlString(appDataPath);

    //Get the file types to download
    filters = fileTyp.GetFileTypesToDownlaod(fileTypePath);

    //Will use app.config for configuration
    PoliteWebCrawler crawler = new PoliteWebCrawler();

    #region "Crawler Events"
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
    #endregion

    CrawlResult result = crawler.Crawl(new Uri(webURL));

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.ToString());
    else
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
}
public static Dictionary<String, CrawledWebPage> Run()
{
    PoliteWebCrawler crawler = new PoliteWebCrawler(_crawlConfig, null, null, null, null, null, null, null, null);

    //Register the event handlers
    crawler.PageCrawlStartingAsync += ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += PageLinksCrawlDisallowed;

    UriBag uris = new UriBag();
    uris.Add("carris", "http://www.carris.pt");
    uris.Add("tfl", "http://www.tfl.gov.uk/");
    uris.Add("publico", "http://www.publico.pt");
    uris.Add("CP", "http://www.cp.pt");
    uris.Add("GVB", "http://en.gvb.nl/");

    crawler.Crawl(uris.Get("CP"));

    return _crawledPages;

    //Tries to save the webpages information
    //Utils.Save(CrawledPages);
}
public void Crawl_MinCrawlDelayGreaterThanZero_CallsDomainRateLimiter()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _dummyConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1; //by having a crawl delay above zero we expect the IDomainRateLimiter to be called

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3)); //by having a crawl delay above zero we expect the IDomainRateLimiter to be called
}
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_ZeroCrawlDelay_DoesNotCallsDomainRateLimiter()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHyperLinkParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), It.IsAny<long>()), Times.Exactly(0));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(0));
}
public void Crawl_MinCrawlDelayDelayZero_StillCallsDomainRateLimiter()
{
    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(1));
}
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_RootPageIsAllowed_AllPagesBelowDisallowed_IsIgnoreRobotsDotTextIfRootDisallowedEnabledTrue_CallsHttpRequester()
{
    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    CrawledPage page1 = new CrawledPage(_rootUri);

    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri, It.IsAny<string>())).Returns(true);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri + "aaaaa", It.IsAny<string>())).Returns(false);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);
    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
    _dummyConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeCrawlDecisionMaker.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeHttpRequester.VerifyAll();
}
public void Crawl_CrawlTimeoutIs1Sec_TimesOut()
{
    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.CrawlTimeoutSeconds = 2;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    CrawlResult result = crawler.Crawl(new Uri("http://localhost.fiddler:1111/"));

    Assert.IsFalse(result.ErrorOccurred);
    Assert.IsTrue(result.Elapsed.TotalSeconds < 8, "Took more than 8 seconds");
    Assert.IsTrue(pagesCrawledCount < 2, "Crawled more than 2 pages");
}