// Exemplo n.º 1
// 0
        private void ConfigureWebCrawler()
        {
            // Build the Abot crawl configuration. Per Abot conventions, a value
            // of 0 on the Max*/timeout settings means "no limit".
            var config = new CrawlConfiguration
            {
                // Crawl throughput and scope limits
                MaxConcurrentThreads                       = Environment.ProcessorCount,
                MaxPagesToCrawl                            = 0,
                MaxPagesToCrawlPerDomain                   = 0,
                MaxPageSizeInBytes                         = 0,
                UserAgentString                            = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
                HttpProtocolVersion                        = HttpProtocolVersion.NotSpecified,
                CrawlTimeoutSeconds                        = 0,
                IsUriRecrawlingEnabled                     = false,
                IsExternalPageCrawlingEnabled              = false,
                IsExternalPageLinksCrawlingEnabled         = false,
                IsRespectUrlNamedAnchorOrHashbangEnabled   = false,
                DownloadableContentTypes                   = "text/html, text/plain",

                // HTTP transport behavior
                HttpServicePointConnectionLimit            = 200,
                HttpRequestTimeoutInSeconds                = 15,
                HttpRequestMaxAutoRedirects                = 7,
                IsHttpRequestAutoRedirectsEnabled          = true,
                IsHttpRequestAutomaticDecompressionEnabled = true,
                IsSendingCookiesEnabled                    = false,
                // NOTE(review): SSL certificate validation is deliberately disabled
                // here; confirm this is acceptable for the sites being crawled.
                IsSslCertificateValidationEnabled          = false,

                // Memory guards (0 = disabled)
                MinAvailableMemoryRequiredInMb             = 0,
                MaxMemoryUsageInMb                         = 0,
                MaxMemoryUsageCacheTimeInSeconds           = 0,

                // Link extraction and retry behavior
                MaxCrawlDepth                              = 1000,
                MaxLinksPerPage                            = 1000,
                IsForcedLinkParsingEnabled                 = false,
                MaxRetryCount                              = 0,
                MinRetryDelayInMilliseconds                = 0,

                // Politeness: honor robots.txt and the various no-follow signals.
                IsRespectRobotsDotTextEnabled                = true,
                UrlPatternsToExclude                         = ExtractorParams.UrlPatternsToExclude,
                IsRespectMetaRobotsNoFollowEnabled           = true,
                IsRespectHttpXRobotsTagHeaderNoFollowEnabled = true,
                IsRespectAnchorRelNoFollowEnabled            = true,
                IsIgnoreRobotsDotTextIfRootDisallowedEnabled = false,
                RobotsDotTextUserAgentString                 = "bingbot",
                MinCrawlDelayPerDomainMilliSeconds           = ExtractorParams.MinCrawlDelay,
                MaxRobotsDotTextCrawlDelayInSeconds          = 5,

                // No authentication is used for the crawl.
                IsAlwaysLogin         = false,
                LoginUser             = "",
                LoginPassword         = "",
                UseDefaultCredentials = false
            };

            if (DoContinue)
            {
                // Resuming: restore the scheduler state from the checkpoint file
                // written by a previous run.
                string checkpointPath = Path.Combine(ContentDirectory.FullName, LogsDirName, CheckpointFileName);
                using (FileStream checkpointStream = new FileStream(checkpointPath, FileMode.Open))
                {
                    scheduler = Scheduler.Deserialize(checkpointStream);
                }
            }
            else
            {
                // Fresh crawl: start with an empty scheduler.
                scheduler = new Scheduler(config.IsUriRecrawlingEnabled, null, null);
            }

            // Wire up the crawler with our scheduler and scoping/link-filtering
            // callbacks; nulls let Abot use its default implementations.
            crawler = new PoliteWebCrawler(config, null, null, scheduler, null, null, null, null, null);
            crawler.IsInternalUri((candidateUri, rootUri) => HtmlFileUtils.ShouldCrawlUri(ExtractorParams.Scope, candidateUri, rootUri));
            crawler.ShouldCrawlPageLinks(WebCrawler_ShouldCrawlPageLinks);
            crawler.PageCrawlCompletedAsync += WebCrawler_PageCrawlCompletedAsync;

            // DEBUG: uncomment to debug Abot crawl progress
            // crawler.PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;

            // DEBUG: uncomment to debug Abot crawling decisions
            // crawler.PageCrawlDisallowedAsync += WebCrawler_PageCrawlDisallowedAsync;
            // crawler.PageLinksCrawlDisallowedAsync += WebCrawler_PageLinksCrawlDisallowedAsync;
        }