Example #1
        /// <summary>
        /// Builds a crawler whose crawl decisions are supplied as lambda expressions
        /// instead of a custom ICrawlDecisionMaker implementation.
        /// </summary>
        /// <returns>A configured <see cref="IWebCrawler"/> restricted to treasury.gov pages.</returns>
        private static IWebCrawler GetCustomBehaviorUsingLambdaWebCrawler()
        {
            IWebCrawler crawler = new PoliteWebCrawler();

            // Only crawl pages on the treasury.gov hosts, and skip PDF documents.
            crawler.ShouldCrawlPage((pageToCrawl, crawlContext) =>
            {
                string uri = pageToCrawl.Uri.AbsoluteUri;

                // Use ordinal comparisons (CA1310): the previous culture-sensitive
                // defaults can misbehave under some cultures, and the case-sensitive
                // EndsWith(".pdf") failed to exclude links ending in ".PDF".
                bool isTreasuryHost =
                    uri.StartsWith("https://home.treasury.gov", StringComparison.OrdinalIgnoreCase) ||
                    uri.StartsWith("https://www.treasury.gov", StringComparison.OrdinalIgnoreCase);
                bool isPdf = uri.EndsWith(".pdf", StringComparison.OrdinalIgnoreCase);

                return new CrawlDecision { Allow = isTreasuryHost && !isPdf };
            });

            // Always download content for pages that were allowed to be crawled.
            crawler.ShouldDownloadPageContent((crawledPage, crawlContext) =>
            {
                return new CrawlDecision { Allow = true };
            });

            //Register a lambda expression that will tell Abot to not crawl links on any page that is not internal to the root uri.
            //NOTE: This lambda is run after the regular ICrawlDecsionMaker.ShouldCrawlPageLinks method is run
            crawler.ShouldCrawlPageLinks((crawledPage, crawlContext) =>
            {
                return new CrawlDecision { Allow = true };
            });

            return crawler;
        }
Example #2
        /// <summary>
        /// Creates a PoliteWebCrawler configured from app.config (via
        /// AbotConfigurationSectionHandler), with a few values overridden in code,
        /// and wires up its decision callbacks and crawl events.
        /// </summary>
        public PoliteWebCrawler CreateCrawler()
        {
            _dataFinder = new DataFinder(new KomputronikDataExtractor());

            XmlConfigurator.Configure();

            CrawlConfiguration crawlConfig = AbotConfigurationSectionHandler.LoadFromXml().Convert();

            crawlConfig.MaxConcurrentThreads = 15;//this overrides the config value

            crawlConfig.MaxCrawlDepth = 15;

            // BUG FIX: the customized crawlConfig was previously discarded — the
            // parameterless constructor re-reads app.config, so the two overrides
            // above never took effect. Pass the config explicitly (same 9-arg
            // overload pattern used elsewhere; nulls select Abot's defaults).
            PoliteWebCrawler crawler = new PoliteWebCrawler(crawlConfig, null, null, null, null, null, null, null, null);

            crawler.ShouldCrawlPage(ShouldCrawlPage);
            crawler.ShouldDownloadPageContent(ShouldCrawlPageContent);
            crawler.ShouldCrawlPageLinks(ShouldCrawlPageLinks);

            crawler.PageCrawlCompletedAsync       += crawler_ProcessPageCrawlCompleted;
            crawler.PageCrawlDisallowedAsync      += crawler_PageCrawlDisallowed;
            crawler.PageLinksCrawlDisallowedAsync += crawler_PageLinksCrawlDisallowed;
            return crawler;
        }
Example #3
        /// <summary>
        /// Builds the Abot crawl configuration in code, restores the scheduler from a
        /// checkpoint when resuming (or creates a fresh one), then constructs the
        /// crawler and attaches its decision callbacks and completion event.
        /// </summary>
        private void ConfigureWebCrawler()
        {
            // NOTE: in Abot, limit values of 0 generally mean "no limit".
            CrawlConfiguration config = new CrawlConfiguration
            {
                // Core crawl behavior
                MaxConcurrentThreads                       = Environment.ProcessorCount,
                MaxPagesToCrawl                            = 0,
                MaxPagesToCrawlPerDomain                   = 0,
                MaxPageSizeInBytes                         = 0,
                UserAgentString                            = "Mozilla/5.0 (Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko",
                HttpProtocolVersion                        = HttpProtocolVersion.NotSpecified,
                CrawlTimeoutSeconds                        = 0,
                IsUriRecrawlingEnabled                     = false,
                IsExternalPageCrawlingEnabled              = false,
                IsExternalPageLinksCrawlingEnabled         = false,
                IsRespectUrlNamedAnchorOrHashbangEnabled   = false,
                DownloadableContentTypes                   = "text/html, text/plain",
                HttpServicePointConnectionLimit            = 200,
                HttpRequestTimeoutInSeconds                = 15,
                HttpRequestMaxAutoRedirects                = 7,
                IsHttpRequestAutoRedirectsEnabled          = true,
                IsHttpRequestAutomaticDecompressionEnabled = true,
                IsSendingCookiesEnabled                    = false,
                IsSslCertificateValidationEnabled          = false,
                MinAvailableMemoryRequiredInMb             = 0,
                MaxMemoryUsageInMb                         = 0,
                MaxMemoryUsageCacheTimeInSeconds           = 0,
                MaxCrawlDepth                              = 1000,
                MaxLinksPerPage                            = 1000,
                IsForcedLinkParsingEnabled                 = false,
                MaxRetryCount                              = 0,
                MinRetryDelayInMilliseconds                = 0,

                // Politeness / robots handling
                IsRespectRobotsDotTextEnabled                = true,
                UrlPatternsToExclude                         = ExtractorParams.UrlPatternsToExclude,
                IsRespectMetaRobotsNoFollowEnabled           = true,
                IsRespectHttpXRobotsTagHeaderNoFollowEnabled = true,
                IsRespectAnchorRelNoFollowEnabled            = true,
                IsIgnoreRobotsDotTextIfRootDisallowedEnabled = false,
                RobotsDotTextUserAgentString                 = "bingbot",
                MinCrawlDelayPerDomainMilliSeconds           = ExtractorParams.MinCrawlDelay,
                MaxRobotsDotTextCrawlDelayInSeconds          = 5,

                // Authentication (disabled)
                IsAlwaysLogin         = false,
                LoginUser             = "",
                LoginPassword         = "",
                UseDefaultCredentials = false,
            };

            if (DoContinue)
            {
                // Resuming: restore the scheduler state from the serialized checkpoint.
                using (FileStream fs = new FileStream(Path.Combine(ContentDirectory.FullName, LogsDirName, CheckpointFileName), FileMode.Open))
                {
                    scheduler = Scheduler.Deserialize(fs);
                }
            }
            else
            {
                // Fresh crawl: start with an empty scheduler.
                scheduler = new Scheduler(config.IsUriRecrawlingEnabled, null, null);
            }

            crawler = new PoliteWebCrawler(config, null, null, scheduler, null, null, null, null, null);
            crawler.IsInternalUri((candidateUri, rootUri) => HtmlFileUtils.ShouldCrawlUri(ExtractorParams.Scope, candidateUri, rootUri));
            crawler.ShouldCrawlPageLinks(WebCrawler_ShouldCrawlPageLinks);
            crawler.PageCrawlCompletedAsync += WebCrawler_PageCrawlCompletedAsync;

            // DEBUG: uncomment to debug Abot crawl progress
            // crawler.PageCrawlStartingAsync += WebCrawler_PageCrawlStartingAsync;

            // DEBUG: uncomment to debug Abot crawling decisions
            // crawler.PageCrawlDisallowedAsync += WebCrawler_PageCrawlDisallowedAsync;
            // crawler.PageLinksCrawlDisallowedAsync += WebCrawler_PageLinksCrawlDisallowedAsync;
        }