private void ConfigureWebCrawler()
{
    CrawlConfiguration config = new CrawlConfiguration();

    // Crawl limits: for Abot counters and sizes, 0 means "no limit"
    config.MaxConcurrentThreads = Environment.ProcessorCount;
    config.MaxPagesToCrawl = 0;
    config.MaxPagesToCrawlPerDomain = 0;
    config.MaxPageSizeInBytes = 0;

    // HTTP client behavior
    config.UserAgentString = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko";
    config.HttpProtocolVersion = HttpProtocolVersion.NotSpecified;
    config.CrawlTimeoutSeconds = 0;
    config.IsUriRecrawlingEnabled = false;
    config.IsExternalPageCrawlingEnabled = false;
    config.IsExternalPageLinksCrawlingEnabled = false;
    config.IsRespectUrlNamedAnchorOrHashbangEnabled = false;
    config.DownloadableContentTypes = "text/html, text/plain";
    config.HttpServicePointConnectionLimit = 200;
    config.HttpRequestTimeoutInSeconds = 15;
    config.HttpRequestMaxAutoRedirects = 7;
    config.IsHttpRequestAutoRedirectsEnabled = true;
    config.IsHttpRequestAutomaticDecompressionEnabled = true;
    config.IsSendingCookiesEnabled = false;
    config.IsSslCertificateValidationEnabled = false;

    // Memory management: 0 disables the memory checks
    config.MinAvailableMemoryRequiredInMb = 0;
    config.MaxMemoryUsageInMb = 0;
    config.MaxMemoryUsageCacheTimeInSeconds = 0;

    // Link extraction and retries
    config.MaxCrawlDepth = 1000;
    config.MaxLinksPerPage = 1000;
    config.IsForcedLinkParsingEnabled = false;
    config.MaxRetryCount = 0;
    config.MinRetryDelayInMilliseconds = 0;

    // Politeness: respect robots.txt, nofollow directives, and a per-domain crawl delay
    config.IsRespectRobotsDotTextEnabled = true;
    config.UrlPatternsToExclude = ExtractorParams.UrlPatternsToExclude;
    config.IsRespectMetaRobotsNoFollowEnabled = true;
    config.IsRespectHttpXRobotsTagHeaderNoFollowEnabled = true;
    config.IsRespectAnchorRelNoFollowEnabled = true;
    config.IsIgnoreRobotsDotTextIfRootDisallowedEnabled = false;
    config.RobotsDotTextUserAgentString = "bingbot";
    config.MinCrawlDelayPerDomainMilliSeconds = ExtractorParams.MinCrawlDelay;
    config.MaxRobotsDotTextCrawlDelayInSeconds = 5;

    // Authentication (not used)
    config.IsAlwaysLogin = false;
    config.LoginUser = "";
    config.LoginPassword = "";
    config.UseDefaultCredentials = false;

    // Start from an empty scheduler, or restore a checkpointed one when continuing a previous crawl
    if (!DoContinue)
    {
        scheduler = new Scheduler(config.IsUriRecrawlingEnabled, null, null);
    }
    else
    {
        using (FileStream fs = new FileStream(Path.Combine(ContentDirectory.FullName, LogsDirName, CheckpointFileName), FileMode.Open))
        {
            scheduler = Scheduler.Deserialize(fs);
        }
    }

    // Wire up the crawler with the configuration, the scheduler, and the crawl decision callbacks
    crawler = new PoliteWebCrawler(config, null, null, scheduler, null, null, null, null, null);
    crawler.IsInternalUri((candidateUri, rootUri) => HtmlFileUtils.ShouldCrawlUri(ExtractorParams.Scope, candidateUri, rootUri));
    crawler.ShouldCrawlPageLinks(WebCrawler_ShouldCrawlPageLinks);
    crawler.PageCrawlCompletedAsync += WebCrawler_PageCrawlCompletedAsync;

    // DEBUG: uncomment to debug Abot crawl progress
    // crawler.PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;

    // DEBUG: uncomment to debug Abot crawling decisions
    // crawler.PageCrawlDisallowedAsync += WebCrawler_PageCrawlDisallowedAsync;
    // crawler.PageLinksCrawlDisallowedAsync += WebCrawler_PageLinksCrawlDisallowedAsync;
}
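
// ---------------------------------------------------------------------------
// Illustrative sketches only (assumptions, not this project's actual handlers):
// Abot 1.x expects the delegates wired above to have the shapes shown below.
// The real WebCrawler_ShouldCrawlPageLinks and WebCrawler_PageCrawlCompletedAsync
// are defined elsewhere in this class; these hypothetical versions just show the
// expected signatures and a plausible minimal body.
// ---------------------------------------------------------------------------

// Hypothetical example: decide whether Abot should extract links from a crawled page,
// reusing the same scoping test as the IsInternalUri lambda above.
private CrawlDecision ShouldCrawlPageLinksExample(CrawledPage crawledPage, CrawlContext crawlContext)
{
    if (!HtmlFileUtils.ShouldCrawlUri(ExtractorParams.Scope, crawledPage.Uri, crawlContext.RootUri))
    {
        return new CrawlDecision { Allow = false, Reason = "Page is outside the extraction scope" };
    }
    return new CrawlDecision { Allow = true };
}

// Hypothetical example: inspect each downloaded page once Abot has finished crawling it.
private void PageCrawlCompletedExample(object sender, PageCrawlCompletedArgs e)
{
    CrawledPage crawledPage = e.CrawledPage;
    if (crawledPage.WebException != null)
    {
        Console.WriteLine($"Failed to crawl {crawledPage.Uri}");
        return;
    }
    // ... process crawledPage.Content.Text here ...
}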