public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_RootPageIsAllowed_AllPagesBelowDisallowed_IsIgnoreRobotsDotTextIfRootDisallowedEnabledTrue_CallsHttpRequester()
{
    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    CrawledPage page1 = new CrawledPage(_rootUri);

    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri, It.IsAny<string>())).Returns(true);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(_rootUri.AbsoluteUri + "aaaaa", It.IsAny<string>())).Returns(false);
    _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny<Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page1));
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
    _dummyConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeCrawlDecisionMaker.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeHttpRequester.VerifyAll();
}
public async Task Crawl_MaxPagesTo25_OnlyCrawls25Pages()
{
    await new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 25;
    configuration.IsExternalPageCrawlingEnabled = true;
    configuration.IsExternalPageLinksCrawlingEnabled = true;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += (a, b) => { pagesCrawledCount++; };

    var res = await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

    Assert.AreEqual(25, pagesCrawledCount);
}
public async Task ExtractRecipesAsync(Uri recipeWebsiteUri, CancellationToken cancellationToken = default)
{
    var crawlResult = await _politeWebCrawler.CrawlAsync(recipeWebsiteUri);
    //TODO: Error handling
    //TODO: Logging
}
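The two TODOs above are left open in the original example. A minimal sketch of how they might be filled in follows; it assumes an injected Microsoft.Extensions.Logging logger field (_logger), which is not part of the original code, and only uses CrawlResult members (ErrorOccurred, ErrorException, RootUri, Elapsed) that appear in the other examples in this section.

// Sketch only: _logger is an assumed ILogger<T> field, not shown in the original example.
public async Task ExtractRecipesAsync(Uri recipeWebsiteUri, CancellationToken cancellationToken = default)
{
    _logger.LogInformation("Starting crawl of {Uri}", recipeWebsiteUri);

    var crawlResult = await _politeWebCrawler.CrawlAsync(recipeWebsiteUri);

    if (crawlResult.ErrorOccurred)
    {
        // Surface the crawl failure instead of silently ignoring it.
        _logger.LogError(crawlResult.ErrorException, "Crawl of {Uri} completed with an error", crawlResult.RootUri.AbsoluteUri);
        throw crawlResult.ErrorException;
    }

    _logger.LogInformation("Crawl of {Uri} completed in {Elapsed}", crawlResult.RootUri.AbsoluteUri, crawlResult.Elapsed);
}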
public async Task Crawl_IsRateLimited()
{
    await new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 3;
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

    int pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration);
    crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

    var uriToCrawl = new Uri("http://localhost:1111/");
    var start = DateTime.Now;
    await crawler.CrawlAsync(uriToCrawl);
    var elapsed = DateTime.Now - start;

    Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
    Assert.AreEqual(3, pagesCrawledCount);
}
public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_DoesNotCallHttpRequester()
{
    var homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };

    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(false);
    _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny<Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(homePage));

    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), It.IsAny<long>()), Times.Exactly(0));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(0));
}
public void Test(Uri uri)
{
    pageCount = 0;
    baseUri = uri;
    string message;

    CrawlConfiguration crawlConfiguration = new CrawlConfiguration();
    crawlConfiguration.MaxConcurrentThreads = 4;
    crawlConfiguration.UserAgentString =
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
        "AppleWebKit/537.36 (KHTML, like Gecko) " +
        "Chrome/60.0.3112.113 Safari/537.36 bot";
    crawlConfiguration.MaxPagesToCrawl = 10000;
    crawlConfiguration.DownloadableContentTypes = "text/html, text/plain, image/jpeg, image/pjpeg, image/png";
    crawlConfiguration.CrawlTimeoutSeconds = 100;
    crawlConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1000;

    using PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfiguration);

    crawler.PageCrawlStarting += ProcessPageCrawlStarted;
    crawler.PageCrawlCompleted += ProcessPageCrawlCompleted;

    CrawlResult result = crawler.CrawlAsync(baseUri).Result;

    if (result.ErrorOccurred)
    {
        message = StringTable.GetString("CRAWL_COMPLETE_ERROR", CultureInfo.InstalledUICulture);
        Log.InfoFormat(CultureInfo.InvariantCulture, message, result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        message = StringTable.GetString("CRAWL_COMPLETE_NO_ERROR", CultureInfo.InstalledUICulture);
        Log.InfoFormat(CultureInfo.InvariantCulture, message, result.RootUri.AbsoluteUri);
    }

    message = StringTable.GetString("TOTAL_PAGES", CultureInfo.InstalledUICulture);
    Log.InfoFormat(CultureInfo.InvariantCulture, message, pageCount.ToString(CultureInfo.InvariantCulture));
}
private static async Task DemoSimpleCrawler()
{
    var config = new CrawlConfiguration
    {
        UserAgentString = "2019RLCrawlAThon",
        MaxPagesToCrawl = 0,
        MinCrawlDelayPerDomainMilliSeconds = 10,
    };
    var start = new Uri("https://thailand.kyocera.com/");
    var crawler = new PoliteWebCrawler(config, new BetterDecisionMaker(start), null, new Scheduler(false, null, new PriorityUriRepository()), null, null, null, null, null);

    var files = new HashSet<string>();
    var decMaker = new CrawlDecisionMaker();
    var batch = new HashSet<string>();

    crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
    crawler.PageCrawlCompleted += (sender, e) =>
    {
        if (new[] { ".exe", ".zip", ".tar" }.Any(c => e.CrawledPage.Uri.AbsolutePath.Contains(c)))
        {
            lock (files)
            {
                Console.WriteLine("Found file: " + e.CrawledPage.Uri.Host + e.CrawledPage.Uri.LocalPath);
                Console.WriteLine(e.CrawledPage.CrawlDepth);

                if (!files.Contains(e.CrawledPage.Uri.ToString()))
                {
                    files.Add(e.CrawledPage.Uri.ToString());
                    batch.Add(e.CrawledPage.Uri.ToString());

                    if (batch.Count >= 10)
                    {
                        using (var httpClient = new HttpClient())
                        using (var request = new HttpRequestMessage(new HttpMethod("POST"), "http://hackathon.reversinglabs.com/api/test/bulk"))
                        {
                            var base64authorization = Convert.ToBase64String(Encoding.ASCII.GetBytes("tztok_jadnici:7@dQ6dqq7YZggcd"));
                            request.Headers.TryAddWithoutValidation("Authorization", $"Basic {base64authorization}");

                            var body = "{\"crawlathon\": {\"query\": {\"site\": \"filehippo\", \"links\":[" + string.Join(", ", batch.Select(s => "\"" + s + "\"")) + "]}}}";
                            request.Content = new StringContent(body, Encoding.UTF8, "application/json");

                            var resp = httpClient.SendAsync(request).Result;
                            batch.Clear();
                        }
                    }
                }
            }
        }
    };

    var crawlResult = await crawler.CrawlAsync(start);
}
public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
{
    var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    var homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    var page1 = new CrawledPage(uri1);
    var page2 = new CrawledPage(uri2);

    var links = new List<HyperLink>
    {
        new HyperLink() { HrefValue = uri1 },
        new HyperLink() { HrefValue = uri2 }
    };

    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(homePage));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page1));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page2));
    _fakeHtmlParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });

    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3); // more than the configured max crawl delay, so this value should be ignored
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny<Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true; // with this set to true we expect the IDomainRateLimiter to be called
    _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2; // less than the robots.txt crawl delay, so this value should be used

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHtmlParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3)); // a crawl delay above zero means the IDomainRateLimiter should be called
}
public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_ZeroCrawlDelay_StillCallsDomainRateLimiter()
{
    var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    var homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    var page1 = new CrawledPage(uri1);
    var page2 = new CrawledPage(uri2);

    var links = new List<HyperLink>
    {
        new HyperLink() { HrefValue = uri1 },
        new HyperLink() { HrefValue = uri2 }
    };

    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(homePage));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page1));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page2));
    _fakeHtmlParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });
    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny<Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHtmlParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), It.IsAny<long>()), Times.Exactly(0));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));
}
public async Task GetDoc()
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 20,
    };

    var crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;

    var crawlResult = await crawler.CrawlAsync(new Uri("https://plantuml.com/"));
}
private static async Task DemoSimpleCrawler()
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 10,                     //Only crawl 10 pages
        MinCrawlDelayPerDomainMilliSeconds = 3000 //Wait this many millisecs between requests
    };

    var crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += PageCrawlCompleted; //Several events available...

    var crawlResult = await crawler.CrawlAsync(new Uri("https://www.onet.pl"));
}
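Several of the examples in this section subscribe a PageCrawlCompleted handler that is not shown. A minimal sketch of such a handler follows; it uses Abot's PageCrawlCompletedArgs and CrawledPage members (HttpRequestException, HttpResponseMessage, Content.Text), assumes System.Net for HttpStatusCode, and the Console output is illustrative rather than taken from any of the projects above.

// Sketch of a PageCrawlCompleted handler; the logging choice (Console) is an assumption.
private static void PageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    var crawledPage = e.CrawledPage;

    // A request-level failure or a non-200 status means the page was not fetched successfully.
    if (crawledPage.HttpRequestException != null || crawledPage.HttpResponseMessage?.StatusCode != HttpStatusCode.OK)
    {
        Console.WriteLine($"Crawl of page failed: {crawledPage.Uri.AbsoluteUri}");
        return;
    }

    if (string.IsNullOrEmpty(crawledPage.Content.Text))
    {
        Console.WriteLine($"Page had no content: {crawledPage.Uri.AbsoluteUri}");
        return;
    }

    Console.WriteLine($"Crawl of page succeeded: {crawledPage.Uri.AbsoluteUri}");
}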
private static async Task DemoSimpleCrawler()
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 25,
        MinCrawlDelayPerDomainMilliSeconds = 3000
    };

    var crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;

    var crawlResult = await crawler.CrawlAsync(new Uri("http://wvtesting2.com"));
}
public async Task DownloadAllArticles()
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 300,
        MinCrawlDelayPerDomainMilliSeconds = 300
    };

    var crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += PageCrawlCompleted;
    crawler.PageCrawlCompleted += Crawler_ProcessPageCrawlCompleted;

    var crawlResult = await crawler.CrawlAsync(new Uri(siteUrl));
}
/// <summary>
/// Initiate crawling with:
/// - the list of URLs to be crawled
/// - the search text to be found
/// - the crawl configuration
/// </summary>
/// <param name="UrlsList"></param>
/// <param name="SearchText"></param>
/// <returns></returns>
private static async Task SimpleCrawler(List<string> UrlsList, string SearchText)
{
    var ListPagesUrs = new List<string>();
    int TotalNumer = 0;

    //loop through the list of start URLs
    foreach (var el in UrlsList)
    {
        var config = new CrawlConfiguration
        {
            MaxPagesToCrawl = 10,                     //Only crawl 10 pages
            MinCrawlDelayPerDomainMilliSeconds = 3000 //Wait this many millisecs between requests
        };

        // use polite crawler
        var crawler = new PoliteWebCrawler(config);
        crawler.PageCrawlStarting += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompleted += PageCrawlCompleted; //Several events available...
        crawler.PageCrawlDisallowed += crawler_PageCrawlDisallowed;
        crawler.PageLinksCrawlDisallowed += crawler_PageLinksCrawlDisallowed;

        //Setup custom class to store data (see the sketch after this example)
        crawler.CrawlBag.CrawlExtension = new CrawlExtension()
        {
            SearchWord = SearchText,
            ListUrls = new System.Collections.Generic.List<string>()
        };

        var test = new List<string>();
        var crawlResult = await crawler.CrawlAsync(new Uri(el));

        List<string> valueList = crawler.CrawlBag.CrawlExtension.ListUrls;
        // add urls found to the list
        ListPagesUrs.AddRange(valueList);
        TotalNumer = TotalNumer + crawler.CrawlBag.CrawlExtension.NumberSearchFound; //increment values found
    }

    Console.WriteLine("==== Total number of sites with word [{0}] is {1}", SearchText, TotalNumer);
    Console.WriteLine("==== Site lists with word {0} =====\r", SearchText);
    foreach (var el in ListPagesUrs)
    {
        Console.WriteLine(el);
    }
}
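The CrawlExtension type stored in CrawlBag above is project-specific and its definition is not shown. A minimal sketch consistent with the members the example reads (SearchWord, ListUrls, NumberSearchFound) might look like this; it is a hypothetical reconstruction, not the original class.

// Hypothetical definition of the CrawlExtension object used above; only the members
// referenced by the example are included.
public class CrawlExtension
{
    public string SearchWord { get; set; }
    public List<string> ListUrls { get; set; } = new List<string>();
    public int NumberSearchFound { get; set; }
}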
private static async Task DemoSimpleCrawler()
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 25,                     //Crawl 25 pages
        MinCrawlDelayPerDomainMilliSeconds = 3000 //Wait 3 seconds between requests
    };

    var crawler = new PoliteWebCrawler(config);
    var siteToCrawl = "http://wiprodigital.com";
    crawler.PageCrawlCompleted += PageCrawlCompleted;

    var crawlResult = await crawler.CrawlAsync(new Uri(siteToCrawl));
}
private static async Task RecipeCrawler(string url)
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 100,
        MaxConcurrentThreads = 2,
        IsUriRecrawlingEnabled = false,
        MinCrawlDelayPerDomainMilliSeconds = 3000,
        MaxCrawlDepth = 1
    };

    var crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += PageCrawlCompleted; //Several events available...

    var crawlResult = await crawler.CrawlAsync(new Uri(url));
}
public async Task Crawl_MinCrawlDelayGreaterThanZero_CallsDomainRateLimiter()
{
    var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    var homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    var page1 = new CrawledPage(uri1);
    var page2 = new CrawledPage(uri2);

    var links = new List<HyperLink>
    {
        new HyperLink() { HrefValue = uri1 },
        new HyperLink() { HrefValue = uri2 }
    };

    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(homePage));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page1));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page2));
    _fakeHtmlParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });

    _dummyConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1; // a crawl delay above zero means the IDomainRateLimiter should be called

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3)); // a crawl delay above zero means the IDomainRateLimiter should be called
}
public async Task Crawl_MaxPagesTo25_OnlyCrawls25Pages()
{
    var configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 25;

    var pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

    await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"));

    Assert.AreEqual(25, pagesCrawledCount);
}
public async Task Crawl_MaxPagesTo5_WithCrawlDelay_OnlyCrawls5Pages()
{
    var configuration = new CrawlConfiguration();
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; //adding delay since it increases the chance of issues with abot crawling more than MaxPagesToCrawl
    configuration.MaxPagesToCrawl = 5;

    var pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

    await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"));

    Assert.AreEqual(5, pagesCrawledCount);
}
private static async Task DemoSimpleCrawler()
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 100,                     //Only crawl 100 pages
        MinCrawlDelayPerDomainMilliSeconds = 3000, //Wait this many milliseconds between requests
        LoginUser = "******", // LoginPassword = "******",
        IsUriRecrawlingEnabled = true,
        UseDefaultCredentials = true
    };

    var crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += PageCrawlCompleted; //Several events available

    var crawlResult = await crawler.CrawlAsync(new Uri("https://www.udemy.com/"));
}
static async Task Main(string[] args)
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 100_000,
        HttpRequestTimeoutInSeconds = 60,
    };

    var crawler = new PoliteWebCrawler(config, new KwestiaSmakuCrawlDecisionMaker(), null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += PageCrawlCompleted;

    var crawlResult = await crawler.CrawlAsync(new Uri("https://www.kwestiasmaku.com/"));

    await using (var file = File.OpenWrite("result.txt"))
    {
        var jsonResult = JsonConvert.SerializeObject(_recipeLinks);
        await file.WriteAsync(Encoding.UTF8.GetBytes(jsonResult));
    }
}
public async Task Crawl_CrawlTimeoutIs1Sec_TimesOut()
{
    var configuration = new CrawlConfiguration();
    configuration.CrawlTimeoutSeconds = 2;

    var pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

    var result = await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"));

    Assert.IsFalse(result.ErrorOccurred);
    Assert.IsTrue(result.Elapsed.TotalSeconds < 8, "Took more than 8 seconds");
    Assert.IsTrue(pagesCrawledCount < 2, "Crawled more than 2 pages");
}
public async Task Crawl_MaxPagesTo5_OnlyCrawls5Pages()
{
    var configuration = new CrawlConfiguration
    {
        IsExternalPageCrawlingEnabled = true,
        MaxPagesToCrawl = 5
    };

    var pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

    await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"));

    Assert.AreEqual(5, pagesCrawledCount);
}
public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_UsesCorrectUserAgentString()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { Content = new PageContent { Text = "content here" } };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(homePage));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page1));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page2));
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny<Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
    _dummyConfiguration.RobotsDotTextUserAgentString = "abcd";

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeRobotsDotText.Verify(f => f.GetCrawlDelay(_dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
    _fakeRobotsDotText.Verify(f => f.IsUrlAllowed(uri1.AbsoluteUri, _dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
    _fakeRobotsDotText.Verify(f => f.IsUrlAllowed(uri2.AbsoluteUri, _dummyConfiguration.RobotsDotTextUserAgentString), Times.Exactly(1));
}
private static async Task DemoSimpleCrawler<T>(IParserSettings parserSettings, IParser<T> parser) where T : class
{
    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = 20,                      //Only crawl 20 pages
        MinCrawlDelayPerDomainMilliSeconds = 1000, //Wait this many millisecs between requests
    };

    var crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += PageCrawlCompleted; // event
    //crawler.ShouldCrawlPageDecisionMaker = CrawlPage; // delegate

    crawler.CrawlBag.Parser = parser;
    crawler.CrawlBag.Settings = parserSettings;

    var crawlResult = await crawler.CrawlAsync(new Uri(parserSettings.BaseUrl));
}
public async Task Crawl_Synchronous_CancellationTokenCancelled_StopsCrawl()
{
    var cancellationTokenSource = new CancellationTokenSource();
    var timer = new System.Timers.Timer(800);
    timer.Elapsed += (o, e) =>
    {
        cancellationTokenSource.Cancel();
        timer.Stop();
        timer.Dispose();
    };
    timer.Start();

    var crawler = new PoliteWebCrawler();
    var result = await crawler.CrawlAsync(new Uri("http://localhost.fiddler:1111/"), cancellationTokenSource);

    Assert.IsTrue(result.ErrorOccurred);
    Assert.IsTrue(result.ErrorException is OperationCanceledException);
}
private static async Task DoCrawl(Smithy smithy)
{
    var config = new CrawlConfiguration();

    if (!string.IsNullOrEmpty(smithy.User) && !string.IsNullOrEmpty(smithy.Pass))
    {
        config.MaxConcurrentThreads = smithy.Threads;
        config.MaxCrawlDepth = smithy.Depth;
        config.MinCrawlDelayPerDomainMilliSeconds = smithy.Delay;
        config.MaxPagesToCrawl = smithy.MaxPages;
        config.MaxRetryCount = 1;
        //HttpServicePointConnectionLimit = 2000,
        config.HttpRequestTimeoutInSeconds = smithy.Timeout;
        config.LoginUser = smithy.User;
        config.LoginPassword = smithy.Pass;
    }

    if (!string.IsNullOrEmpty(smithy.User) || !string.IsNullOrEmpty(smithy.Pass))
    {
        if (string.IsNullOrEmpty(smithy.Pass) || string.IsNullOrEmpty(smithy.User))
        {
            Console.WriteLine("Please specify both a username and a password if using basic auth");
            System.Environment.Exit(1);
        }
    }
    else
    {
        config.MaxConcurrentThreads = smithy.Threads;
        config.MaxCrawlDepth = smithy.Depth;
        config.MinCrawlDelayPerDomainMilliSeconds = smithy.Delay;
        config.MaxPagesToCrawl = smithy.MaxPages;
        config.MaxRetryCount = 1;
        //HttpServicePointConnectionLimit = 2000,
        config.HttpRequestTimeoutInSeconds = smithy.Timeout;
    }

    var crawler = new PoliteWebCrawler(config);
    crawler.PageCrawlCompleted += PageCrawlCompleted;

    var crawlResult = await crawler.CrawlAsync(new Uri(smithy.Url));
}
public async Task<List<CrawlerPage>> CollectLinks(string URL, IEnumerable<ICrawlerFilter> IncludeFilters = null, IEnumerable<ICrawlerFilter> ExcludeFilters = null)
{
    var result = new List<CrawlerPage>();
    var crawler = new PoliteWebCrawler(AbotProvider.DefaultConfig);

    var useIncludeFilters = IncludeFilters != null && IncludeFilters.Count() > 0;
    var useExcludeFilters = ExcludeFilters != null && ExcludeFilters.Count() > 0;

    crawler.PageCrawlCompleted += (s, e) =>
    {
        var currentURL = e.CrawledPage.Uri.GetLeftPart(UriPartial.Path); // remove query-params
        Debug.WriteLine($"Page {e.CrawledPage.Uri}, Depth: {e.CrawledPage.CrawlDepth} -> {e.CrawledPage.ParsedLinks?.Count()} Links");

        var match = IncludeFilters?.FirstOrDefault(f => f.Execute(currentURL));
        if (useIncludeFilters && match == null)
        {
            Debug.WriteLine("skipped by Include filter");
            return;
        }

        match = ExcludeFilters?.FirstOrDefault(f => f.Execute(currentURL));
        if (useExcludeFilters && match != null)
        {
            Debug.WriteLine("skipped by Exclude filter");
            return;
        }

        result.Add(new CrawlerPage
        {
            URL = currentURL,
            ContentType = (match is CrawlerVideoFilter) ? eCrawlerContentType.VIDEO : eCrawlerContentType.CONTENT,
            Depth = e.CrawledPage.CrawlDepth,
            Links = e.CrawledPage.ParsedLinks?.Select(x => x.HrefValue.ToString()).ToList()
        });
    };

    var crawlSummary = await crawler.CrawlAsync(new Uri(URL));
    Debug.WriteLine($"[Completed] {crawlSummary.CrawlContext.CrawledCount}");

    return result;
}
public async Task Crawl_MaxPagesTo5_WithCrawlDelay_OnlyCrawls5Pages()
{
    await new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" }).MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; //adding delay since it increases the chance of issues with abot crawling more than MaxPagesToCrawl
    configuration.MaxPagesToCrawl = 5;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

    await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

    Assert.AreEqual(5, pagesCrawledCount);
}
public async Task Crawl_IsRateLimited()
{
    var configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 3;
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

    var pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration);
    crawler.PageCrawlCompleted += (a, b) => pagesCrawledCount++;

    var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
    var start = DateTime.Now;
    await crawler.CrawlAsync(uriToCrawl);
    var elapsed = DateTime.Now - start;

    Assert.IsTrue(elapsed.TotalMilliseconds > 2000);
    Assert.AreEqual(3, pagesCrawledCount);
}