public void GenStuff()
{
    PoliteWebCrawler crawler = new PoliteWebCrawler();

    //PageObjects are being created as they are asynchronously found during the crawl
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(new Uri(ConfigurationManager.AppSettings["HomePageURL"]));

    int count = result.CrawlContext.CrawledCount;
    Console.WriteLine(result.CrawlContext.ToJSON());
    Console.WriteLine(result.ToJSON());
    Console.WriteLine("Total Crawled Page Count = " + count);

    ////Parse txt file URLs, get all page elements from each page and put them into a dictionary
    //var xpathElements = CreateXpathsFromUrls();

    ////Get all values that are the same
    //var sameValues = GetSameValues(xpathElements);

    ////Get new elements that do not exist on multiple pages
    //var newElements = GetNewElements(sameValues, xpathElements);

    Console.WriteLine("hello");
}
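None of the snippets in this collection define the event handlers they subscribe (crawler_ProcessPageCrawlStarting, crawler_ProcessPageCrawlCompleted, and so on). A minimal sketch of what they might look like, assuming the Abot 1.x event argument types (PageCrawlStartingArgs, PageCrawlCompletedArgs, PageCrawlDisallowedArgs, PageLinksCrawlDisallowedArgs) and System.Net for HttpStatusCode; the console messages are placeholders only:

// Hypothetical handler implementations; not part of the original snippets.
void crawler_ProcessPageCrawlStarting(object sender, PageCrawlStartingArgs e)
{
    PageToCrawl pageToCrawl = e.PageToCrawl;
    Console.WriteLine("About to crawl link {0} which was found on page {1}",
        pageToCrawl.Uri.AbsoluteUri, pageToCrawl.ParentUri.AbsoluteUri);
}

void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
}

void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl page {0} due to {1}",
        e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
}

void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl the links on page {0} due to {1}",
        e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
}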
static void DoCrawl()
{
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 5000;
    crawlConfig.UserAgentString = "abot v1.0 http://code.google.com/p/abot";
    //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
    //crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");

    //Will use app.config for configuration
    //(note: the parameterless constructor reads its settings from app.config, so the crawlConfig built above is not passed in)
    PoliteWebCrawler crawler = new PoliteWebCrawler();
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(new Uri("http://sunnah.com/"));

    Console.WriteLine("Crawled content count: " + result.CrawlContext.CrawledCount);

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
protected AbotRecipeExtractor(
    PoliteWebCrawler politeWebCrawler,
    IHtmlRecipeParser htmlRecipeParser,
    ILogger<AbotRecipeExtractor> logger = null)
{
    _politeWebCrawler = politeWebCrawler;
    _htmlRecipeParser = htmlRecipeParser;
    Logger = logger;

    _politeWebCrawler.PageCrawlCompleted += OnPageCrawlCompleted;
}
public WebSpider()
{
    _crawler = new PoliteWebCrawler();
    _crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    _crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    _crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    _crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    _crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        CrawlDecision decision = new CrawlDecision { Allow = true };

        var isCrawlDepth1 = pageToCrawl.CrawlDepth == 0 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/s?wd");
        var isCrawlDepth2 = pageToCrawl.CrawlDepth == 1 && !pageToCrawl.Uri.AbsoluteUri.Contains("www.baidu.com/link");

        if (isCrawlDepth1 || isCrawlDepth2)
        {
            return new CrawlDecision { Allow = false, Reason = "Don't want to crawl pages outside the Baidu search results" };
        }

        return decision;
    });
}
private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = new PoliteWebCrawler();

    //Only crawl treasury.gov pages, and skip PDF links
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        return new CrawlDecision
        {
            Allow = (pageToCrawl.Uri.AbsoluteUri.StartsWith("https://home.treasury.gov") ||
                     pageToCrawl.Uri.AbsoluteUri.StartsWith("https://www.treasury.gov")) &&
                    !pageToCrawl.Uri.AbsoluteUri.EndsWith(".pdf")
        };
    });

    crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that decides whether the links on a crawled page should be crawled (allow all here).
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        return new CrawlDecision { Allow = true };
    });

    return crawler;
}
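A possible call site for the factory above, following the same pattern the other snippets in this collection use; the start URL and the completed handler are illustrative assumptions, not part of the original:

// Hypothetical usage of the factory above.
IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();
crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

CrawlResult result = crawler.Crawl(new Uri("https://home.treasury.gov"));
Console.WriteLine("Crawled {0} pages.", result.CrawlContext.CrawledCount);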
public void Crawl_MinCrawlDelayDelayZero_DomainRateLimiterNotCalled()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Never());
}
public void Crawl_MinCrawlDelayGreaterThanZero_CallsDomainRateLimiter()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri) { RawContent = "content here" };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeHttpRequester.Setup(f => f.MakeRequest(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page2);
    _fakeHyperLinkParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    //By having a crawl delay above zero we expect the IDomainRateLimiter to be called
    _dummyConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    //One rate-limit call per requested page: the home page plus the two linked pages
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));
}
private static void Main(string[] args)
{
    try
    {
        Uri uriToCrawl = GetSiteToCrawl();

        // I'm using the default crawler
        var crawler = new PoliteWebCrawler();

        // I need to subscribe to this event in order to process pages that have been crawled
        crawler.PageCrawlCompletedAsync += ProcessPageCrawlCompleted;

        // Start the crawl
        CrawlResult crawlResult = crawler.Crawl(uriToCrawl);

        // Generate report
        Task<ReportResult> reportTask = GenerateReport();
        PrintResultInformation(reportTask.Result);
    }
    catch (Exception ex)
    {
        System.Console.ForegroundColor = ConsoleColor.Red;
        System.Console.WriteLine("There was an error while trying to crawl the page.");
        System.Console.Write(ex);
        System.Console.ReadKey();
    }
}
static void Main(string[] args)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 1;
    crawlConfig.MaxPagesToCrawl = 1;

    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    //crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    //crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    //This is synchronous, it will not go to the next line until the crawl has completed
    CrawlResult result = crawler.Crawl(new Uri("http://www.kmhk.kmu.edu.tw/news/list.asp?P_classify=9"));

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
/// <summary>
/// Run the crawler
/// </summary>
public void StartCrawl()
{
    //Set up the crawler
    PoliteWebCrawler crawler = new PoliteWebCrawler();

    //Wire up the crawl event handlers
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    //Start crawling. This is synchronous, it will not go to the next line until the crawl has completed
    CrawlResult result = crawler.Crawl(new Uri(link));

    //Report the result
    if (result.ErrorOccurred)
    {
        log.Error("Crawl of " + result.RootUri.AbsoluteUri + " completed with error: " + result.ErrorException.Message);
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        log.Info("Crawl of " + result.RootUri.AbsoluteUri + " completed without error.");
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    flag = false;
}
public WebsiteIndexer(string host, ICollection<string> ignoredPathes = null, int delayPerRequestMilliSeconds = 1000, int maxPagesToCrawl = 1000)
{
    _host = host;

    var config = new CrawlConfiguration
    {
        MaxPagesToCrawl = maxPagesToCrawl,
        MinCrawlDelayPerDomainMilliSeconds = delayPerRequestMilliSeconds,
        IsExternalPageCrawlingEnabled = false
    };

    Crawler = new PoliteWebCrawler(config)
    {
        ShouldCrawlPageDecisionMaker = (pageToCrawl, crawlContext) =>
        {
            var ignored = string.IsNullOrEmpty(pageToCrawl.Uri?.AbsolutePath) ||
                          ignoredPathes?.Any(p => Regex.IsMatch(pageToCrawl.Uri.AbsolutePath, p)) == true;

            if (ignored)
            {
                Console.WriteLine($"Ignored '{pageToCrawl.Uri?.AbsolutePath}'");
                return new CrawlDecision { Allow = false, Reason = "Path matches pattern in blacklist" };
            }

            return new CrawlDecision { Allow = true };
        }
    };

    Crawler.PageCrawlCompleted += PageCrawlCompleted;
}
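A possible way to construct the WebsiteIndexer above; the host value and the regex blacklist entries are made-up examples, not taken from the original:

// Hypothetical construction of WebsiteIndexer; all argument values are examples only.
var indexer = new WebsiteIndexer(
    host: "https://example.com",
    ignoredPathes: new List<string> { "^/admin", "\\.pdf$" },
    delayPerRequestMilliSeconds: 500,
    maxPagesToCrawl: 200);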
public static void StartCrawlEbuyer(string url)
{
    try
    {
        PoliteWebCrawler crawler = new PoliteWebCrawler();
        crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
        crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
        crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

        //Cancel the crawl automatically after 5 seconds
        TimeSpan ts = new TimeSpan(0, 0, 5);
        CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(ts);

        CrawlResult result = crawler.Crawl(new Uri(url), cancellationTokenSource);

        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }
    }
    catch (Exception)
    {
        //Swallow any crawl errors and fall through to the extraction step
    }

    ExtractingHtml.ExtractDetailsEbuyer();
}
public void StartCrawl(string[] pages)
{
    //Presumably a helper on this class that populates _crawlConfiguration
    CrawlConfiguration();

    PoliteWebCrawler crawler = new PoliteWebCrawler(_crawlConfiguration);
    crawler.PageCrawlStartingAsync += Crawler_PageCrawlStartingAsync;
    crawler.PageCrawlCompletedAsync += Crawler_PageCrawlCompletedAsync;
    crawler.PageCrawlDisallowedAsync += Crawler_PageCrawlDisallowedAsync;
    crawler.PageLinksCrawlDisallowedAsync += Crawler_PageLinksCrawlDisallowedAsync;

    foreach (var page in pages)
    {
        _parser.IdentifyParser(page);

        var result = crawler.Crawl(new Uri(page));

        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }

        _parser.Save();
    }
}
public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_DoesNotCallHttpRequester()
{
    var homePage = new CrawledPage(_rootUri)
    {
        Content = new PageContent { Text = "content here" }
    };

    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(homePage));
    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(false);
    _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny<Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), It.IsAny<long>()), Times.Exactly(0));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(0));
}
public void Crawl(CrawlRequest request)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.CrawlTimeoutSeconds = 100;
    crawlConfig.MaxConcurrentThreads = 10;
    crawlConfig.MaxPagesToCrawl = 1000;
    crawlConfig.UserAgentString = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; abot v1.0 http://code.google.com/p/abot)";
    crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue1", "1111");
    crawlConfig.ConfigurationExtensions.Add("SomeCustomConfigValue2", "2222");
    crawlConfig.MaxCrawlDepth = 10;
    crawlConfig.DownloadableContentTypes = "text/html, text/plain";

    //Will use the manually created crawlConfig object above
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    CrawlResult result = crawler.Crawl(new Uri(request.EntryURL));

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }
}
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_IsIgnoreRobotsDotTextIfRootDisallowedEnabledTrue_CallsHttpRequester()
{
    CrawledPage homePage = new CrawledPage(_rootUri)
    {
        Content = new PageContent { Text = "content here" }
    };
    CrawledPage page1 = new CrawledPage(_rootUri);

    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(false);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);
    _fakeHttpRequester.Setup(f => f.MakeRequest(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(page1);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;
    _dummyConfiguration.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = true;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeCrawlDecisionMaker.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeHttpRequester.VerifyAll();
}
public void Crawl_IsRateLimited()
{
    new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" })
        .MakeRequest(new Uri("http://localhost.fiddler:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 3;
    configuration.MinCrawlDelayPerDomainMilliSeconds = 1000; // 1 second * 2 pages = 2 (or more) seconds

    int pagesCrawledCount = 0;

    var crawler = new PoliteWebCrawler(configuration);
    crawler.PageCrawlCompletedAsync += (a, b) => pagesCrawledCount++;

    var uriToCrawl = new Uri("http://localhost.fiddler:1111/");
    var start = DateTime.Now;
    crawler.Crawl(uriToCrawl);
    var elapsed = DateTime.Now - start;

    Assert.GreaterOrEqual(elapsed.TotalMilliseconds, 2000);
    Assert.AreEqual(3, pagesCrawledCount);
}
static void Main(string[] args)
{
    CrawlConfiguration config = new CrawlConfiguration();
    config.MaxConcurrentThreads = 1; // Web Extractor is not currently thread-safe.

    // Create the PhantomJS instance. This will spawn a new PhantomJS process using phantomjs.exe.
    // Make sure to dispose this instance or you will have a zombie process!
    IWebDriver driver = CreatePhantomJsDriver(config);

    // Create the content extractor that uses PhantomJS.
    IWebContentExtractor extractor = new JavaScriptContentExtractor(driver);

    // Create a PageRequester that will use the extractor.
    IPageRequester requester = new PageRequester(config, extractor);

    using (IWebCrawler crawler = new PoliteWebCrawler(config, null, null, null, requester, null, null, null, null))
    {
        crawler.PageCrawlCompleted += OnPageCrawlCompleted;

        CrawlResult result = crawler.Crawl(new Uri("http://wvtesting2.com/"));

        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
        }
    }

    Console.Read();
}
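The CreatePhantomJsDriver helper referenced above is not shown. One way it could be written, assuming the Selenium PhantomJS bindings (OpenQA.Selenium.PhantomJS); the capability name and service settings are illustrative, not taken from the original:

// Hypothetical sketch of the missing helper, assuming Selenium's PhantomJS bindings.
private static IWebDriver CreatePhantomJsDriver(CrawlConfiguration config)
{
    // Run phantomjs.exe without a visible console window.
    var service = PhantomJSDriverService.CreateDefaultService();
    service.HideCommandPromptWindow = true;

    // Pass the crawler's user agent string through to the headless browser.
    var options = new PhantomJSOptions();
    options.AddAdditionalCapability("phantomjs.page.settings.userAgent", config.UserAgentString);

    return new PhantomJSDriver(service, options);
}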
public static void Main(string[] args)
{
    PoliteWebCrawler crawler = new PoliteWebCrawler();
    crawler.PageCrawlCompletedAsync += Crawler_ProcessPageCrawlCompleted;

    var start = DateTime.Now;
    var uri = new Uri("https://lord.technology");

    CrawlResult result = crawler.Crawl(uri);

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    var finish = DateTime.Now;
    Console.WriteLine((finish - start).TotalMinutes);

    using (FileStream fs = File.Open(@"./crawl.json", FileMode.Create))
    using (StreamWriter sw = new StreamWriter(fs))
    using (JsonWriter jw = new JsonTextWriter(sw))
    {
        jw.Formatting = Formatting.Indented;

        JsonSerializer serializer = new JsonSerializer();
        serializer.Serialize(jw, new { nodes = _pages, edges = _relationships });
    }
}
public async Task Crawl_MaxPagesTo25_OnlyCrawls25Pages()
{
    await new PageRequester(new CrawlConfiguration { UserAgentString = "aaa" })
        .MakeRequestAsync(new Uri("http://localhost:1111/PageGenerator/ClearCounters"));

    CrawlConfiguration configuration = new CrawlConfiguration();
    configuration.MaxPagesToCrawl = 25;
    configuration.IsExternalPageCrawlingEnabled = true;
    configuration.IsExternalPageLinksCrawlingEnabled = true;

    int pagesCrawledCount = 0;

    PoliteWebCrawler crawler = new PoliteWebCrawler(configuration, null, null, null, null, null, null, null, null);
    crawler.PageCrawlCompleted += (a, b) => { pagesCrawledCount++; };

    var res = await crawler.CrawlAsync(new Uri("http://localhost:1111/"));

    Assert.AreEqual(25, pagesCrawledCount);
}
//Crawling code for GSM
public static void StartCrawlGSM(string url)
{
    PoliteWebCrawler crawler = new PoliteWebCrawler();
    crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStartingGSM;
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompletedGSM;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowedGSM;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowedGSM;

    //NOTE: a zero TimeSpan cancels this token almost immediately; use a positive value to let the crawl run
    TimeSpan ts = new TimeSpan(0, 0, 0);
    CancellationTokenSource cancellationTokenSource = new CancellationTokenSource(ts);

    CrawlResult result = crawler.Crawl(new Uri(url), cancellationTokenSource);

    if (result.ErrorOccurred)
    {
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
    }

    //FileStream fs = new FileStream("url.txt", FileMode.Open);
    //StreamReader sr = new StreamReader(fs);
    //string str = "";
    //while ((str = sr.ReadLine()) != null)
    //{
    //    StartCrawl(str);
    //}

    ExtractingHtml.ExtractingDetailsGSM();
}
public async Task Crawl(string rootUri, int maxPages)
{
    try
    {
        PoliteWebCrawler crawler = new PoliteWebCrawler(CreateCrawlConfiguration(maxPages), null, null, null, null, null, null, null, null);
        crawler.PageCrawlStartingAsync += crawler_ProcessPageCrawlStarting;
        crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;

        //This is synchronous, it will not go to the next line until the crawl has completed
        CrawlResult result = crawler.Crawl(new Uri(rootUri));

        if (result.ErrorOccurred)
        {
            Console.WriteLine("Crawl of {0} ({1} pages) completed with error: {2}", result.RootUri.AbsoluteUri, PageCount, result.ErrorException.Message);
        }
        else
        {
            Console.WriteLine("Crawl of {0} ({1} pages) completed without error.", result.RootUri.AbsoluteUri, PageCount);
        }

        await _handler.CrawlFinishedAsync();
    }
    catch (Exception e)
    {
    }
}
public void Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_PageIsDisallowed_DoesNotCallHttpRequester()
{
    Uri uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    Uri uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    CrawledPage homePage = new CrawledPage(_rootUri)
    {
        Content = new PageContent { Text = "content here" }
    };
    CrawledPage page1 = new CrawledPage(uri1);
    CrawledPage page2 = new CrawledPage(uri2);

    List<Uri> links = new List<Uri> { uri1, uri2 };

    _fakeHttpRequester.Setup(f => f.MakeRequest(It.IsAny<Uri>(), It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(homePage);
    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(false);
    _fakeRobotsDotTextFinder.Setup(f => f.Find(It.IsAny<Uri>())).Returns(_fakeRobotsDotText.Object);

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHyperLinkParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    _unitUnderTest.Crawl(_rootUri);

    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), It.IsAny<long>()), Times.Exactly(0));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(0));
}
public KwestiaSmakuRecipeExtractor(
    PoliteWebCrawler politeWebCrawler,
    IHtmlRecipeParser htmlRecipeParser,
    ILogger<AbotRecipeExtractor> logger = null)
    : base(politeWebCrawler, htmlRecipeParser, logger)
{
}
public void Test(Uri uri)
{
    pageCount = 0;
    baseUri = uri;
    string message;

    CrawlConfiguration crawlConfiguration = new CrawlConfiguration();
    crawlConfiguration.MaxConcurrentThreads = 4;
    crawlConfiguration.UserAgentString =
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " +
        "AppleWebKit/537.36 (KHTML, like Gecko) " +
        "Chrome/60.0.3112.113 Safari/537.36 bot";
    crawlConfiguration.MaxPagesToCrawl = 10000;
    crawlConfiguration.DownloadableContentTypes = "text/html, text/plain, image/jpeg, image/pjpeg, image/png";
    crawlConfiguration.CrawlTimeoutSeconds = 100;
    crawlConfiguration.MinCrawlDelayPerDomainMilliSeconds = 1000;

    using PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfiguration);

    crawler.PageCrawlStarting += ProcessPageCrawlStarted;
    crawler.PageCrawlCompleted += ProcessPageCrawlCompleted;

    CrawlResult result = crawler.CrawlAsync(baseUri).Result;

    if (result.ErrorOccurred)
    {
        message = StringTable.GetString("CRAWL_COMPLETE_ERROR", CultureInfo.InstalledUICulture);
        Log.InfoFormat(CultureInfo.InvariantCulture, message, result.RootUri.AbsoluteUri, result.ErrorException.Message);
    }
    else
    {
        message = StringTable.GetString("CRAWL_COMPLETE_NO_ERROR", CultureInfo.InstalledUICulture);
        Log.InfoFormat(CultureInfo.InvariantCulture, message, result.RootUri.AbsoluteUri);
    }

    message = StringTable.GetString("TOTAL_PAGES", CultureInfo.InstalledUICulture);
    Log.InfoFormat(CultureInfo.InvariantCulture, message, pageCount.ToString(CultureInfo.InvariantCulture));
}
private static async Task DemoSimpleCrawler()
{
    var config = new CrawlConfiguration
    {
        UserAgentString = "2019RLCrawlAThon",
        MaxPagesToCrawl = 0,
        MinCrawlDelayPerDomainMilliSeconds = 10,
    };

    var start = new Uri("https://thailand.kyocera.com/");
    var crawler = new PoliteWebCrawler(
        config,
        new BetterDecisionMaker(start),
        null,
        new Scheduler(false, null, new PriorityUriRepository()),
        null, null, null, null, null);

    var files = new HashSet<string>();
    var decMaker = new CrawlDecisionMaker();
    var batch = new HashSet<string>();

    crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;
    crawler.PageCrawlCompleted += (sender, e) =>
    {
        if (new[] { ".exe", ".zip", ".tar" }.Any(c => e.CrawledPage.Uri.AbsolutePath.Contains(c)))
        {
            lock (files)
            {
                Console.WriteLine("Found file: " + e.CrawledPage.Uri.Host + e.CrawledPage.Uri.LocalPath);
                Console.WriteLine(e.CrawledPage.CrawlDepth);

                if (!files.Contains(e.CrawledPage.Uri.ToString()))
                {
                    files.Add(e.CrawledPage.Uri.ToString());
                    batch.Add(e.CrawledPage.Uri.ToString());

                    if (batch.Count >= 10)
                    {
                        using (var httpClient = new HttpClient())
                        using (var request = new HttpRequestMessage(new HttpMethod("POST"), "http://hackathon.reversinglabs.com/api/test/bulk"))
                        {
                            var base64authorization = Convert.ToBase64String(Encoding.ASCII.GetBytes("tztok_jadnici:7@dQ6dqq7YZggcd"));
                            request.Headers.TryAddWithoutValidation("Authorization", $"Basic {base64authorization}");

                            var body = "{\"crawlathon\": {\"query\": {\"site\": \"filehippo\", \"links\":[" + string.Join(", ", batch.Select(s => "\"" + s + "\"")) + "]}}}";
                            request.Content = new StringContent(body, Encoding.UTF8, "application/json");

                            var resp = httpClient.SendAsync(request).Result;
                            batch.Clear();
                        }
                    }
                }
            }
        }
    };

    var crawlResult = await crawler.CrawlAsync(start);
}
public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_CrawlDelayAboveMinDomainCrawlDelay_CallsDomainRateLimiter()
{
    var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    var homePage = new CrawledPage(_rootUri)
    {
        Content = new PageContent { Text = "content here" }
    };
    var page1 = new CrawledPage(uri1);
    var page2 = new CrawledPage(uri2);

    var links = new List<HyperLink>
    {
        new HyperLink() { HrefValue = uri1 },
        new HyperLink() { HrefValue = uri2 }
    };

    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(homePage));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page1));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page2));
    _fakeHtmlParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });

    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(3); //This is more than the max configured crawl delay (should be ignored)
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny<Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true; //By having this equal to true we expect the IDomainRateLimiter to be called
    _dummyConfiguration.MaxRobotsDotTextCrawlDelayInSeconds = 2; //This is less than the robots.txt crawl delay (should be used)

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHtmlParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), 2000), Times.Exactly(1));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3)); //With a crawl delay above zero we expect the IDomainRateLimiter to be called
}
static void Main(string[] args)
{
    SiteMapFinder finder = new SiteMapFinder();

    PoliteWebCrawler crawler = new PoliteWebCrawler(null, null, null, null, null, finder, null, null, null);
    crawler.PageCrawlCompleted += Crawler_PageCrawlCompleted;

    CrawlResult result = crawler.Crawl(new Uri("http://tenders.rfpalertservices.com/sitemap/"));
}
public async Task Crawl_IsRespectRobotsDotTextTrue_RobotsDotTextFound_ZeroCrawlDelay_StillCallsDomainRateLimiter()
{
    var uri1 = new Uri(_rootUri.AbsoluteUri + "a.html");
    var uri2 = new Uri(_rootUri.AbsoluteUri + "b.html");

    var homePage = new CrawledPage(_rootUri)
    {
        Content = new PageContent { Text = "content here" }
    };
    var page1 = new CrawledPage(uri1);
    var page2 = new CrawledPage(uri2);

    var links = new List<HyperLink>
    {
        new HyperLink() { HrefValue = uri1 },
        new HyperLink() { HrefValue = uri2 }
    };

    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(_rootUri, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(homePage));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri1, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page1));
    _fakeHttpRequester.Setup(f => f.MakeRequestAsync(uri2, It.IsAny<Func<CrawledPage, CrawlDecision>>())).Returns(Task.FromResult(page2));
    _fakeHtmlParser.Setup(f => f.GetLinks(It.Is<CrawledPage>(p => p.Uri == homePage.Uri))).Returns(links);
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPage(It.IsAny<PageToCrawl>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldCrawlPageLinks(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = true });
    _fakeCrawlDecisionMaker.Setup(f => f.ShouldRecrawlPage(It.IsAny<CrawledPage>(), It.IsAny<CrawlContext>())).Returns(new CrawlDecision { Allow = false });

    _fakeRobotsDotText.Setup(f => f.GetCrawlDelay(It.IsAny<string>())).Returns(0);
    _fakeRobotsDotText.Setup(f => f.IsUrlAllowed(It.IsAny<string>(), It.IsAny<string>())).Returns(true);
    _fakeRobotsDotTextFinder.Setup(f => f.FindAsync(It.IsAny<Uri>())).Returns(Task.FromResult(_fakeRobotsDotText.Object));

    _dummyConfiguration.IsRespectRobotsDotTextEnabled = true;

    _unitUnderTest = new PoliteWebCrawler(_dummyConfiguration, _fakeCrawlDecisionMaker.Object, _dummyThreadManager, _dummyScheduler, _fakeHttpRequester.Object, _fakeHtmlParser.Object, _fakeMemoryManager.Object, _fakeDomainRateLimiter.Object, _fakeRobotsDotTextFinder.Object);

    await _unitUnderTest.CrawlAsync(_rootUri);

    _fakeHttpRequester.VerifyAll();
    _fakeHtmlParser.VerifyAll();
    _fakeRobotsDotText.VerifyAll();
    _fakeRobotsDotTextFinder.VerifyAll();
    _fakeDomainRateLimiter.Verify(f => f.AddDomain(It.IsAny<Uri>(), It.IsAny<long>()), Times.Exactly(0));
    _fakeDomainRateLimiter.Verify(f => f.RateLimit(It.IsAny<Uri>()), Times.Exactly(3));
}
public void Constructor_ZeroMinCrawlDelay_DoesNotThrowExceptionCreatingAnIDomainRateLimiterWithLessThan1Millisec()
{
    using (var unused = new PoliteWebCrawler(new CrawlConfiguration { MinCrawlDelayPerDomainMilliSeconds = 0 }, null, null, null, null, null, null, null, null))
    {
    }
}