private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
{
    IWebCrawler crawler = new PoliteWebCrawler();

    //Crawl only pages under the two treasury.gov hosts, and skip PDF files
    crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
    {
        return new CrawlDecision
        {
            Allow = (pageToCrawl.Uri.AbsoluteUri.StartsWith("https://home.treasury.gov") ||
                     pageToCrawl.Uri.AbsoluteUri.StartsWith("https://www.treasury.gov")) &&
                    !pageToCrawl.Uri.AbsoluteUri.EndsWith(".pdf")
        };
    });

    //Always download page content for pages that passed the decision above
    crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
    {
        return new CrawlDecision { Allow = true };
    });

    //Register a lambda expression that decides whether the links found on a crawled page
    //should themselves be crawled (here: always allow).
    //NOTE: This lambda is run after the regular ICrawlDecisionMaker.ShouldCrawlPageLinks method is run
    crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
    {
        return new CrawlDecision { Allow = true };
    });

    return crawler;
}
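The factory above only builds the crawler; in Abot 1.x the crawl itself is started with a blocking Crawl() call. A minimal usage sketch under that assumption follows; the method name RunTreasuryCrawl, the choice of root URI, and the console logging are illustrative additions, not part of the original example.

// Minimal usage sketch (assumes Abot 1.x; root URI and logging are illustrative)
private static void RunTreasuryCrawl()
{
    IWebCrawler crawler = GetCustomBehaviorUsingLambdaWebCrawler();

    // Crawl() blocks until the crawl completes or a configured limit is reached
    CrawlResult result = crawler.Crawl(new Uri("https://home.treasury.gov/"));

    if (result.ErrorOccurred)
        Console.WriteLine("Crawl of {0} completed with error: {1}", result.RootUri.AbsoluteUri, result.ErrorException.Message);
    else
        Console.WriteLine("Crawl of {0} completed without error.", result.RootUri.AbsoluteUri);
}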
public PoliteWebCrawler CreateCrawler()
{
    _dataFinder = new DataFinder(new KomputronikDataExtractor());

    XmlConfigurator.Configure();

    //Start from the app.config values, then override a few of them in code
    CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();
    crawlConfig.MaxConcurrentThreads = 15; //overrides the app.config value
    crawlConfig.MaxCrawlDepth = 15;

    //Pass the modified config explicitly; the parameterless constructor would re-read
    //app.config and silently drop the two overrides above
    PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);
    crawler.ShouldCrawlPage(ShouldCrawlPage);
    crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);
    crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

    crawler.PageCrawlCompletedAsync += crawler_ProcessPageCrawlCompleted;
    crawler.PageCrawlDisallowedAsync += crawler_PageCrawlDisallowed;
    crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;

    return crawler;
}
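CreateCrawler wires up three event handlers that are defined elsewhere in the class. A minimal sketch of compatible handler bodies, assuming the standard Abot 1.x event argument types; the console logging is a placeholder for the real processing (which in this class presumably feeds _dataFinder).

// Illustrative handler bodies for the events wired above
// (requires using System.Net; for HttpStatusCode)
private void crawler_ProcessPageCrawlCompleted(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;

    // WebException != null short-circuits the check, so HttpWebResponse is only read on completed requests
    if (crawledPage.WebException != null || crawledPage.HttpWebResponse.StatusCode != HttpStatusCode.OK)
        Console.WriteLine("Crawl of page failed {0}", crawledPage.Uri.AbsoluteUri);
    else
        Console.WriteLine("Crawl of page succeeded {0}", crawledPage.Uri.AbsoluteUri);
}

private void crawler_PageCrawlDisallowed(object sender, PageCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl page {0} due to {1}", e.PageToCrawl.Uri.AbsoluteUri, e.DisallowedReason);
}

private void crawler_PageLinksCrawlDisallowed(object sender, PageLinksCrawlDisallowedArgs e)
{
    Console.WriteLine("Did not crawl the links on page {0} due to {1}", e.CrawledPage.Uri.AbsoluteUri, e.DisallowedReason);
}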
private void ConfigureWebCrawler()
{
    CrawlConfiguration config = new CrawlConfiguration();

    // Crawl scope and volume (0 = no limit)
    config.MaxConcurrentThreads = Environment.ProcessorCount;
    config.MaxPagesToCrawl = 0;
    config.MaxPagesToCrawlPerDomain = 0;
    config.MaxPageSizeInBytes = 0;
    config.UserAgentString = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko";
    config.HttpProtocolVersion = HttpProtocolVersion.NotSpecified;
    config.CrawlTimeoutSeconds = 0;
    config.IsUriRecrawlingEnabled = false;
    config.IsExternalPageCrawlingEnabled = false;
    config.IsExternalPageLinksCrawlingEnabled = false;
    config.IsRespectUrlNamedAnchorOrHashbangEnabled = false;
    config.DownloadableContentTypes = "text/html, text/plain";

    // HTTP request settings
    config.HttpServicePointConnectionLimit = 200;
    config.HttpRequestTimeoutInSeconds = 15;
    config.HttpRequestMaxAutoRedirects = 7;
    config.IsHttpRequestAutoRedirectsEnabled = true;
    config.IsHttpRequestAutomaticDecompressionEnabled = true;
    config.IsSendingCookiesEnabled = false;
    config.IsSslCertificateValidationEnabled = false;

    // Memory limits (0 = no limit)
    config.MinAvailableMemoryRequiredInMb = 0;
    config.MaxMemoryUsageInMb = 0;
    config.MaxMemoryUsageCacheTimeInSeconds = 0;

    // Depth, link extraction, and retries
    config.MaxCrawlDepth = 1000;
    config.MaxLinksPerPage = 1000;
    config.IsForcedLinkParsingEnabled = false;
    config.MaxRetryCount = 0;
    config.MinRetryDelayInMilliseconds = 0;

    // Politeness: robots directives and per-domain rate limiting
    config.IsRespectRobotsDotTextEnabled = true;
    config.UrlPatternsToExclude = ExtractorParams.UrlPatternsToExclude;
    config.IsRespectMetaRobotsNoFollowEnabled = true;
    config.IsRespectHttpXRobotsTagHeaderNoFollowEnabled = true;
    config.IsRespectAnchorRelNoFollowEnabled = true;
    config.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = false;
    config.RobotsDotTextUserAgentString = "bingbot";
    config.MinCrawlDelayPerDomainMilliSeconds = ExtractorParams.MinCrawlDelay;
    config.MaxRobotsDotTextCrawlDelayInSeconds = 5;

    // Authentication (not used)
    config.IsAlwaysLogin = false;
    config.LoginUser = "";
    config.LoginPassword = "";
    config.UseDefaultCredentials = false;

    // Fresh crawl: create a new scheduler; resumed crawl: restore it from the last checkpoint file
    if (!DoContinue)
    {
        scheduler = new Scheduler(config.IsUriRecrawlingEnabled, null, null);
    }
    else
    {
        using (FileStream fs = new FileStream(Path.Combine(ContentDirectory.FullName, LogsDirName, CheckpointFileName), FileMode.Open))
        {
            scheduler = Scheduler.Deserialize(fs);
        }
    }

    crawler = new PoliteWebCrawler(config, null, null, scheduler, null, null, null, null, null);
    crawler.IsInternalUri((candidateUri, rootUri) => HtmlFileUtils.ShouldCrawlUri(ExtractorParams.Scope, candidateUri, rootUri));
    crawler.ShouldCrawlPageLinks(WebCrawler_ShouldCrawlPageLinks);
    crawler.PageCrawlCompletedAsync += WebCrawler_PageCrawlCompletedAsync;

    // DEBUG: uncomment to debug Abot crawl progress
    // crawler.PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;

    // DEBUG: uncomment to debug Abot crawling decisions
    // crawler.PageCrawlDisallowedAsync += WebCrawler_PageCrawlDisallowedAsync;
    // crawler.PageLinksCrawlDisallowedAsync += WebCrawler_PageLinksCrawlDisallowedAsync;
}
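ConfigureWebCrawler registers WebCrawler_ShouldCrawlPageLinks without showing it. A minimal sketch of the expected delegate shape, matching Abot's Func&lt;CrawledPage, CrawlContext, CrawlDecision&gt; signature; the print-page rule is a hypothetical example, not the project's actual logic.

// Illustrative shape of the link-crawling decision delegate registered above
// (the URL-pattern rule is an assumption for demonstration purposes)
private CrawlDecision WebCrawler_ShouldCrawlPageLinks(CrawledPage crawledPage, CrawlContext crawlContext)
{
    // Hypothetical rule: never expand links found on print-friendly pages,
    // which typically duplicate content already reachable elsewhere
    if (crawledPage.Uri.AbsoluteUri.Contains("/print/"))
        return new CrawlDecision { Allow = false, Reason = "Print-friendly page" };

    return new CrawlDecision { Allow = true };
}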