private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = new PoliteWebCrawler();

    //Register a lambda expression that restricts the crawl to the treasury.gov hosts and skips pdf files
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        return new CrawlDecision
        {
            Allow = (pageToCrawl.Uri.AbsoluteUri.StartsWith("https://home.treasury.gov") ||
                     pageToCrawl.Uri.AbsoluteUri.StartsWith("https://www.treasury.gov")) &&
                    !pageToCrawl.Uri.AbsoluteUri.EndsWith(".pdf")
        };
    });

    //Register a lambda expression that allows the content of every crawled page to be downloaded
    crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that allows links on every crawled page to be crawled.
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        return new CrawlDecision { Allow = true };
    });

    return crawler;
}
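//A minimal sketch (assuming the synchronous Crawl API of Abot 1.x) showing how the crawler
//built above could be run; the start url, the method name RunTreasuryCrawl, and the console
//reporting are illustrative only.
private static void RunTreasuryCrawl()
{
    IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();

    //Crawl blocks until the crawl finishes or a configured limit (depth, page count, timeout) is reached
    CrawlResult result = crawler.Crawl(new Uri("https://home.treasury.gov"));

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    else
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
}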
public PoliteWebCrawler CreateCrawler()
{
    _dataFinder = new DataFinder(new KomputronikDataExtractor());

    //Configure log4net, then load the crawl configuration from app.config
    XmlConfigurator.Configure();
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 15; //this overrides the app.config value
    crawlConfig.MaxCrawlDepth = 15;

    //Pass the modified configuration in explicitly; the parameterless constructor reads
    //app.config directly, so the two overrides above would otherwise be silently ignored
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

    crawler.ShouldCrawlPage(ShouldCrawlPage);
    crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);
    crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    return crawler;
}
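//Hypothetical implementations of the three event handlers wired up in CreateCrawler; the
//original bodies are not shown, so this is a sketch based on the standard Abot 1.x event
//args (PageCrawlCompletedArgs, PageCrawlDisallowedArgs, PageLinksCrawlDisallowedArgs).
void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
    {
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
        return;
    }

    //A cleanly crawled page is where _dataFinder would typically be handed the raw page
    //text for extraction (the DataFinder API itself is not shown in these snippets)
    if (string.IsNullOrEmpty(crawledPage.Content.Text))
        Console.WriteLine("Page had no content {0}", crawledPage.Uri.AbsoluteUri);
}

void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl page {0} due to {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
}

void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl the links on page {0} due to {1}", e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
}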