private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = new PoliteWebCrawler();

    //Only crawl pages on the treasury.gov hosts and skip pdf documents
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        return new CrawlDecision
        {
            Allow = (pageToCrawl.Uri.AbsoluteUri.StartsWith("https://home.treasury.gov")
                        || pageToCrawl.Uri.AbsoluteUri.StartsWith("https://www.treasury.gov"))
                    && !pageToCrawl.Uri.AbsoluteUri.EndsWith(".pdf")
        };
    });

    //Download the content of every page that is crawled
    crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that tells Abot whether to crawl the links found on a page (this one allows links on every page).
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        return new CrawlDecision { Allow = true };
    });

    return crawler;
}
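// A minimal usage sketch, not part of the original sample: it starts the crawler returned
// above against one of the treasury.gov roots allowed by the ShouldCrawlPage lambda. The
// method name RunTreasuryCrawl and the inline completed-handler are assumptions; Crawl and
// CrawlResult follow Abot's documented synchronous API.
private static void RunTreasuryCrawl()
{
    IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();

    //Log each page as it finishes; e.CrawledPage carries the uri, response and raw content
    crawler.PageCrawlCompletedAsync += (sender, e) =>
        Console.WriteLine("Crawled {0}", e.CrawledPage.Uri.AbsoluteUri);

    CrawlResult result = crawler.Crawl(new Uri("https://home.treasury.gov"));

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    else
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
}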
private void doWork(object sender, DoWorkEventArgs e)
{
    CrawlConfiguration crawlConfig = new CrawlConfiguration();
    crawlConfig.MaxConcurrentThreads = AppSettings.Settings.Crawler.MaxConcurrentThreads;
    crawlConfig.MaxPagesToCrawl = AppSettings.Settings.Crawler.MaxPagesToCrawl;
    crawlConfig.UserAgentString = AppSettings.Settings.Crawler.UserAgentString;
    crawlConfig.HttpRequestMaxAutoRedirects = AppSettings.Settings.Crawler.HttpRequestMaxAutoRedirects;
    crawlConfig.MaxCrawlDepth = AppSettings.Settings.Crawler.MaxCrawlDepth;

    //Use a custom IPageRequester; the nulls fall back to Abot's default implementations
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, new SitePageRequester(crawlConfig), null, null, null, null);
    crawler.PageCrawlStartingAsync += crawler_PageCrawlStartingAsync;
    crawler.PageCrawlCompletedAsync += crawler_PageCrawlCompletedAsync;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowedAsync;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowedAsync;
    crawler.ShouldCrawlPage(crawler_ShouldCrawlPage);

    m_globalWordsCounter = 0;
    updateCrawlingStarted();

    Uri url = new Uri(siteURL.Text);
    CrawlResult crawlerResult = crawler.Crawl(url);

    if (m_backgroundWorker.CancellationPending)
    {
        e.Cancel = true;
        m_backgroundWorker.ReportProgress(0);
    }

    string elapsedTimeStr = crawlerResult.Elapsed.ToString(@"dd\.hh\:mm\:ss");
    log.Debug("*** Crawling completed. Elapsed time: " + elapsedTimeStr);
    log.Debug("*** Total words found in this site: " + m_globalWordsCounter.ToString());

    // create CSV report
    createCSVFile(m_globalWordsCounter);
    updateCrawlingFinished();
    done();
}
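// The event handlers wired in doWork are not part of the snippet above. The sketch below is
// an assumed implementation of crawler_PageCrawlCompletedAsync that feeds the word counter
// used in the CSV report; the whitespace-based word split and the assumption that
// m_globalWordsCounter is an int field are illustrative, not taken from the original app.
private void crawler_PageCrawlCompletedAsync(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    //Skip pages that failed or did not return 200 OK
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse == null
        || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        log.Debug("Crawl of page failed: " + crawledPage.Uri.AbsoluteUri);
        return;
    }

    //Count words by splitting the raw page text on whitespace (assumed counting rule)
    int wordsOnPage = crawledPage.Content.Text
        .Split(new[] { ' ', '\t', '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries)
        .Length;

    //The crawl runs on multiple threads, so update the shared counter atomically
    Interlocked.Add(ref m_globalWordsCounter, wordsOnPage);
    log.Debug("Crawled " + crawledPage.Uri.AbsoluteUri + " (" + wordsOnPage + " words)");
}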
private PoliteWebCrawler SetUp()
{
    XmlConfigurator.Configure();

    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 5; //this overrides the app.config value
    // Be careful with the depth; even 0 already returns plenty of records
    crawlConfig.MaxCrawlDepth = 0;

    //Pass the (overridden) app.config settings in; the nulls fall back to Abot's default implementations
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.ShouldCrawlPage(ShouldCrawlPage);
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    return crawler;
}
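// The ShouldCrawlPage method registered in SetUp is not included in the snippet above. A
// minimal sketch of a matching decision delegate is shown below; the same-host rule is an
// assumed example of a filter, not the original logic.
private CrawlDecision ShouldCrawlPage(PageToCrawl pageToCrawl, CrawlContext crawlContext)
{
    //Only follow pages on the same host as the root uri that started the crawl
    bool sameHost = string.Equals(
        pageToCrawl.Uri.Host,
        crawlContext.RootUri.Host,
        StringComparison.OrdinalIgnoreCase);

    return new CrawlDecision
    {
        Allow = sameHost,
        Reason = sameHost ? "" : "Page is outside the root host"
    };
}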
public PoliteWebCrawler CreateCrawler()
{
    _dataFinder = new DataFinder(new KomputronikDataExtractor());

    XmlConfigurator.Configure();

    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 15; //this overrides the app.config value
    crawlConfig.MaxCrawlDepth = 15;

    //Pass the (overridden) app.config settings in; the nulls fall back to Abot's default implementations
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.ShouldCrawlPage(ShouldCrawlPage);
    crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);
    crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);
    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    return crawler;
}
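// A sketch of how the crawler built by CreateCrawler might be started. The Run method, the
// start url (inferred from the KomputronikDataExtractor name) and the console output are
// assumptions; Crawl and the CrawlResult members follow Abot's documented API.
public void Run()
{
    PoliteWebCrawler crawler = CreateCrawler();

    CrawlResult result = crawler.Crawl(new Uri("https://www.komputronik.pl/"));

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl ended with error: " + result.ErrorException.Message);
    else
        Console.WriteLine("Crawl completed in " + result.Elapsed);
}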