示例#1
0
        /// <summary>
        /// Builds a <see cref="PoliteWebCrawler"/> whose crawl decisions are supplied by lambdas.
        /// A page is crawled only when it is served over HTTPS from <c>home.treasury.gov</c> or
        /// <c>www.treasury.gov</c> and its path does not end in <c>.pdf</c>. Page content download
        /// and link crawling are always allowed by the lambdas registered here.
        /// </summary>
        /// <returns>The configured crawler.</returns>
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = new PoliteWebCrawler();

            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                var uri = pageToCrawl.Uri;

                // Compare the parsed scheme and host instead of a string prefix of AbsoluteUri:
                // a prefix test also accepts look-alike URLs such as
                // "https://www.treasury.gov.example.com/" or "https://home.treasury.gov@example.com/".
                bool isHttps = string.Equals(uri.Scheme, "https", System.StringComparison.OrdinalIgnoreCase);
                bool isTreasuryHost =
                    string.Equals(uri.Host, "home.treasury.gov", System.StringComparison.OrdinalIgnoreCase) ||
                    string.Equals(uri.Host, "www.treasury.gov", System.StringComparison.OrdinalIgnoreCase);

                // Test the path only (no query string or fragment) and ignore case, so that
                // "report.PDF" and "report.pdf?download=1" are skipped as well.
                bool isPdf = uri.AbsolutePath.EndsWith(".pdf", System.StringComparison.OrdinalIgnoreCase);

                return new CrawlDecision
                {
                    Allow = isHttps && isTreasuryHost && !isPdf
                };
            });

            // Always download the content of a page that was allowed to be crawled.
            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
            {
                return new CrawlDecision { Allow = true };
            });

            // Register a lambda that allows the links of every crawled page to be followed;
            // restricting the crawl to the treasury hosts is done by ShouldCrawlPage above.
            // NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run.
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                return new CrawlDecision { Allow = true };
            });

            return crawler;
        }
示例#2
0
        /// <summary>
        /// Creates a <see cref="PoliteWebCrawler"/> that uses the configuration loaded from the
        /// Abot XML configuration section, with the thread count and crawl depth overridden,
        /// and wires up this class's crawl-decision callbacks and crawl event handlers.
        /// </summary>
        /// <remarks>
        /// Side effects: assigns a new <c>DataFinder</c> to <c>_dataFinder</c> and calls
        /// <c>XmlConfigurator.Configure()</c>.
        /// </remarks>
        /// <returns>The configured crawler.</returns>
        public PoliteWebCrawler CreateCrawler()
        {
            _dataFinder = new DataFinder(new KomputronikDataExtractor());

            XmlConfigurator.Configure();

            // Start from the values in the XML configuration section, then override selected ones in code.
            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
            crawlConfig.MaxConcurrentThreads = 15; // this overrides the config value
            crawlConfig.MaxCrawlDepth = 15;        // this overrides the config value

            // Hand the customized configuration to the crawler. The parameterless constructor
            // reads app.config on its own, which silently discarded the overrides above.
            // The null arguments request the library's default implementations.
            // NOTE(review): this is the nine-argument overload of Abot 1.2+ — confirm against
            // the Abot version referenced by the project.
            PoliteWebCrawler crawler = new PoliteWebCrawler(
                crawlConfig, null, null, null, null, null, null, null, null);

            // Custom crawl decisions.
            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

            // Crawl event notifications.
            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

            return crawler;
        }